diff --git a/src/datadog/datadog_agent.cpp b/src/datadog/datadog_agent.cpp index e3fb611e..bd0f5c0d 100644 --- a/src/datadog/datadog_agent.cpp +++ b/src/datadog/datadog_agent.cpp @@ -219,8 +219,24 @@ void DatadogAgent::flush() { return; } + // Ideally: + /*auto [encode_result, duration] = measure([&trace_chunks] {*/ + /* std::string body;*/ + /* msgpack_encode(body, trace_chunks);*/ + /*});*/ + std::string body; + + auto beg = std::chrono::steady_clock::now(); auto encode_result = msgpack_encode(body, trace_chunks); + auto end = std::chrono::steady_clock::now(); + + telemetry::distribution::add( + metrics::tracer::trace_chunk_serialization_duration, + std::chrono::duration_cast<std::chrono::milliseconds>(end - beg).count()); + telemetry::distribution::add(metrics::tracer::trace_chunk_serialized_bytes, + static_cast(body.size())); + if (auto* error = encode_result.if_error()) { logger_->log_error(*error); return; @@ -311,11 +327,17 @@ void DatadogAgent::flush() { }; telemetry::counter::increment(metrics::tracer::api::requests); + telemetry::distribution::add(metrics::tracer::api::bytes_sent, + static_cast(body.size())); + auto post_result = http_client_->post(traces_endpoint_, std::move(set_request_headers), std::move(body), std::move(on_response), std::move(on_error), clock_().tick + request_timeout_); if (auto* error = post_result.if_error()) { + // NOTE(@dmehala): `technical` would be a better error kind. + telemetry::counter::increment(metrics::tracer::api::errors, + {"type:network"}); logger_->log_error( error->with_prefix("Unexpected error submitting traces: ")); } diff --git a/src/datadog/telemetry/telemetry_impl.cpp b/src/datadog/telemetry/telemetry_impl.cpp index aeb33e04..6a84f242 100644 --- a/src/datadog/telemetry/telemetry_impl.cpp +++ b/src/datadog/telemetry/telemetry_impl.cpp @@ -17,6 +17,40 @@ using namespace datadog::tracing; using namespace std::chrono_literals; namespace datadog::telemetry { +namespace internal_metrics { + +/// The number of logs created with a given log level. Useful for calculating +/// impact for other features (automatic sending of logs). Levels should be one +/// of `debug`, `info`, `warn`, `error`, `critical`. +const telemetry::Counter logs_created{"logs_created", "general", true}; + +/// The number of requests sent to a telemetry endpoint that errored, +/// tagged by the error type (e.g. `type:timeout`, `type:network`, +/// `type:status_code`) and endpoint (`endpoint:agent`, `endpoint:agentless`). +const telemetry::Counter errors{"telemetry_api.errors", "telemetry", true}; + +/// The number of requests sent to a telemetry endpoint, regardless of success, +/// tagged by the endpoint (`endpoint:agent`, `endpoint:agentless`). +const telemetry::Counter requests{"telemetry_api.requests", "telemetry", true}; + +/// The number of responses received from the endpoint, tagged with status code +/// (`status_code:200`, `status_code:404`) and endpoint (`endpoint:agent`, +/// `endpoint:agentless`). +const telemetry::Counter responses{"telemetry_api.responses", "telemetry", + true}; + +/// The size of the payload sent to the telemetry endpoint in bytes, tagged by +/// the endpoint (`endpoint:agent`, `endpoint:agentless`). +const telemetry::Distribution bytes_sent{"telemetry_api.bytes", "telemetry", + true}; + +/// The time it takes to send the payload to the endpoint in ms, tagged by +/// the endpoint (`endpoint:agent`, `endpoint:agentless`).
+const telemetry::Distribution request_duration{"telemetry_api.ms", "telemetry", + true}; + +} // namespace internal_metrics + namespace { HTTPClient::URL make_telemetry_endpoint(HTTPClient::URL url) { @@ -174,26 +208,8 @@ Telemetry::Telemetry(FinalizedConfiguration config, host_info_(get_host_info()) { // Callback for successful telemetry HTTP requests, to examine HTTP // status. - telemetry_on_response_ = [logger = logger_]( - int response_status, - const DictReader& /*response_headers*/, - std::string response_body) { - if (response_status < 200 || response_status >= 300) { - logger->log_error([&](auto& stream) { - stream << "Unexpected telemetry response status " << response_status - << " with body (if any, starts on next line):\n" - << response_body; - }); - } - }; - - // Callback for unsuccessful telemetry HTTP requests. - telemetry_on_error_ = [logger = logger_](Error error) { - logger->log_error(error.with_prefix( - "Error occurred during HTTP request for telemetry: ")); - }; - send_telemetry("app-started", app_started()); + http_client_->drain(clock_().tick + 2s); schedule_tasks(); } @@ -216,20 +232,23 @@ Telemetry::~Telemetry() { // The app-closing message is bundled with a message containing the // final metric values. send_telemetry("app-closing", app_closing()); - http_client_->drain(clock_().tick + 1s); + http_client_->drain(clock_().tick + 2s); } } Telemetry::Telemetry(Telemetry&& rhs) : config_(std::move(rhs.config_)), logger_(std::move(rhs.logger_)), - telemetry_on_response_(std::move(rhs.telemetry_on_response_)), - telemetry_on_error_(std::move(rhs.telemetry_on_error_)), telemetry_endpoint_(std::move(rhs.telemetry_endpoint_)), tracer_signature_(std::move(rhs.tracer_signature_)), http_client_(rhs.http_client_), clock_(std::move(rhs.clock_)), scheduler_(std::move(rhs.scheduler_)), + counters_(std::move(rhs.counters_)), + counters_snapshot_(std::move(rhs.counters_snapshot_)), + rates_(std::move(rhs.rates_)), + rates_snapshot_(std::move(rhs.rates_snapshot_)), + distributions_(std::move(rhs.distributions_)), seq_id_(rhs.seq_id_), config_seq_ids_(rhs.config_seq_ids_), host_info_(rhs.host_info_) { @@ -242,13 +261,17 @@ Telemetry& Telemetry::operator=(Telemetry&& rhs) { cancel_tasks(rhs.tasks_); std::swap(config_, rhs.config_); std::swap(logger_, rhs.logger_); - std::swap(telemetry_on_response_, rhs.telemetry_on_response_); - std::swap(telemetry_on_error_, rhs.telemetry_on_error_); std::swap(telemetry_endpoint_, rhs.telemetry_endpoint_); std::swap(http_client_, rhs.http_client_); std::swap(tracer_signature_, rhs.tracer_signature_); std::swap(http_client_, rhs.http_client_); std::swap(clock_, rhs.clock_); + std::swap(scheduler_, rhs.scheduler_); + std::swap(counters_, rhs.counters_); + std::swap(counters_snapshot_, rhs.counters_snapshot_); + std::swap(rates_, rhs.rates_); + std::swap(rates_snapshot_, rhs.rates_snapshot_); + std::swap(distributions_, rhs.distributions_); std::swap(seq_id_, rhs.seq_id_); std::swap(config_seq_ids_, rhs.config_seq_ids_); std::swap(host_info_, rhs.host_info_); @@ -259,16 +282,19 @@ Telemetry& Telemetry::operator=(Telemetry&& rhs) { void Telemetry::log_error(std::string message) { if (!config_.report_logs) return; + increment_counter(internal_metrics::logs_created, {"level:error"}); log(std::move(message), LogLevel::ERROR); } void Telemetry::log_error(std::string message, std::string stacktrace) { if (!config_.report_logs) return; + increment_counter(internal_metrics::logs_created, {"level:error"}); log(std::move(message), LogLevel::ERROR, stacktrace); 
} void Telemetry::log_warning(std::string message) { if (!config_.report_logs) return; + increment_counter(internal_metrics::logs_created, {"level:warning"}); log(std::move(message), LogLevel::WARNING); } @@ -293,10 +319,55 @@ void Telemetry::send_telemetry(StringView request_type, std::string payload) { } }; - auto post_result = http_client_->post( - telemetry_endpoint_, set_telemetry_headers, std::move(payload), - telemetry_on_response_, telemetry_on_error_, clock_().tick + 5s); + auto telemetry_on_response = [this, logger = logger_]( + int response_status, + const DictReader& /*response_headers*/, + std::string response_body) { + if (response_status >= 500) { + increment_counter(internal_metrics::responses, + {"status_code:5xx", "endpoint:agent"}); + } else if (response_status >= 400) { + increment_counter(internal_metrics::responses, + {"status_code:4xx", "endpoint:agent"}); + } else if (response_status >= 300) { + increment_counter(internal_metrics::responses, + {"status_code:3xx", "endpoint:agent"}); + } else if (response_status >= 200) { + increment_counter(internal_metrics::responses, + {"status_code:2xx", "endpoint:agent"}); + } else if (response_status >= 100) { + increment_counter(internal_metrics::responses, + {"status_code:1xx", "endpoint:agent"}); + } + + if (response_status < 200 || response_status >= 300) { + logger->log_error([&](auto& stream) { + stream << "Unexpected telemetry response status " << response_status + << " with body (if any, starts on next line):\n" + << response_body; + }); + } + }; + + // Callback for unsuccessful telemetry HTTP requests. + auto telemetry_on_error = [this, logger = logger_](Error error) { + increment_counter(internal_metrics::errors, + {"type:network", "endpoint:agent"}); + logger->log_error(error.with_prefix( + "Error occurred during HTTP request for telemetry: ")); + }; + + increment_counter(internal_metrics::requests, {"endpoint:agent"}); + add_datapoint(internal_metrics::bytes_sent, {"endpoint:agent"}, + payload.size()); + + auto post_result = + http_client_->post(telemetry_endpoint_, set_telemetry_headers, + std::move(payload), std::move(telemetry_on_response), + std::move(telemetry_on_error), clock_().tick + 5s); if (auto* error = post_result.if_error()) { + increment_counter(internal_metrics::errors, + {"type:network", "endpoint:agent"}); logger_->log_error( error->with_prefix("Unexpected error submitting telemetry event: ")); } diff --git a/src/datadog/telemetry/telemetry_impl.h b/src/datadog/telemetry/telemetry_impl.h index 03d50017..1c8ef2dc 100644 --- a/src/datadog/telemetry/telemetry_impl.h +++ b/src/datadog/telemetry/telemetry_impl.h @@ -34,8 +34,6 @@ class Telemetry final { /// Shared pointer to the user logger instance. 
std::shared_ptr logger_; std::vector tasks_; - tracing::HTTPClient::ResponseHandler telemetry_on_response_; - tracing::HTTPClient::ErrorHandler telemetry_on_error_; tracing::HTTPClient::URL telemetry_endpoint_; tracing::TracerSignature tracer_signature_; std::shared_ptr http_client_; diff --git a/src/datadog/telemetry_metrics.cpp b/src/datadog/telemetry_metrics.cpp index ccf56db9..ab0363e6 100644 --- a/src/datadog/telemetry_metrics.cpp +++ b/src/datadog/telemetry_metrics.cpp @@ -4,6 +4,7 @@ namespace datadog::tracing::metrics { namespace tracer { const telemetry::Counter spans_created = {"spans_created", "tracers", true}; +const telemetry::Counter spans_dropped = {"spans_dropped", "tracers", true}; const telemetry::Counter spans_finished = {"spans_finished", "tracers", true}; const telemetry::Counter trace_segments_created = {"trace_segments_created", @@ -11,6 +12,28 @@ const telemetry::Counter trace_segments_created = {"trace_segments_created", const telemetry::Counter trace_segments_closed = {"trace_segments_closed", "tracers", true}; + +const telemetry::Distribution trace_chunk_size = {"trace_chunk_size", "tracers", + true}; + +const telemetry::Distribution trace_chunk_serialized_bytes = { + "trace_chunk_serialization.bytes", "tracers", true}; + +const telemetry::Distribution trace_chunk_serialization_duration = { + "trace_chunk_serialization.ms", "tracers", true}; + +const telemetry::Counter trace_chunks_enqueued = {"trace_chunks_enqueued", + "tracers", true}; + +const telemetry::Counter trace_chunks_enqueued_for_serialization = { + "trace_chunks_enqueued_for_serialization", "tracers", true}; + +const telemetry::Counter trace_chunks_dropped = {"trace_chunks_dropped", + "tracers", true}; + +const telemetry::Counter trace_chunks_sent = {"trace_chunks_sent", "tracers", + true}; + const telemetry::Counter context_header_truncated = { "context_header.truncated", "tracers", @@ -20,8 +43,21 @@ const telemetry::Counter context_header_truncated = { namespace api { const telemetry::Counter requests = {"trace_api.requests", "tracers", true}; const telemetry::Counter responses = {"trace_api.responses", "tracers", true}; +const telemetry::Distribution bytes_sent = {"trace_api.bytes", "tracers", true}; +const telemetry::Distribution request_duration = {"trace_api.ms", "tracers", + true}; const telemetry::Counter errors = {"trace_api.errors", "tracers", true}; } // namespace api + +namespace trace_context { +const telemetry::Counter injected = {"context_header_style.injected", "tracers", + true}; +const telemetry::Counter extracted = {"context_header_style.extracted", + "tracers", true}; +const telemetry::Counter truncated = {"context_header.truncated", "tracers", + true}; +} // namespace trace_context + } // namespace tracer } // namespace datadog::tracing::metrics diff --git a/src/datadog/telemetry_metrics.h b/src/datadog/telemetry_metrics.h index b3461475..df0406b4 100644 --- a/src/datadog/telemetry_metrics.h +++ b/src/datadog/telemetry_metrics.h @@ -11,11 +11,52 @@ namespace tracer { /// `integration_name:opentracing`). 
extern const telemetry::Counter spans_created; +/// The number of spans dropped and the reason for being dropped, for example +/// `reason:p0_drop` (the span was part of a p0 trace that was dropped by the +/// tracer), `reason:overfull_buffer` (the local buffer was full, and the span +/// had to be dropped), `reason:serialization_error` (there was an error +/// serializing the span and it had to be dropped). +extern const telemetry::Counter spans_dropped; + /// The number of spans finished, optionally (if implementation allows) tagged /// manual API (`integration_name:datadog`, `integration_name:otel` or /// `integration_name:opentracing`). extern const telemetry::Counter spans_finished; +/// The number of spans in the trace chunk when it is enqueued. +extern const telemetry::Distribution trace_chunk_size; + +/// The size in bytes of the serialized trace chunk. +extern const telemetry::Distribution trace_chunk_serialized_bytes; + +/// The time it takes to serialize a trace chunk. +extern const telemetry::Distribution trace_chunk_serialization_duration; + +/// The number of times a trace chunk is enqueued for sampling/serialization. In +/// partial-flush scenarios, multiple trace chunks may be enqueued per trace +/// segment/local trace. +extern const telemetry::Counter trace_chunks_enqueued; + +/// The number of trace chunks kept for serialization. Excludes single-span +/// sampling spans. Tagged by one of `reason:p0_keep` (the trace was a p0 trace +/// that was kept for sending to the agent) or `reason:default` (the tracer is +/// not dropping p0 spans, so the span was enqueued 'by default' for sending to +/// the trace-agent). +extern const telemetry::Counter trace_chunks_enqueued_for_serialization; + +/// The number of trace chunks dropped prior to serialization, tagged by reason. +/// Includes traces which are dropped due to errors, overfull buffers, as well +/// as due to the sampling decision. For example `reason:p0_drop` (the span was part of a p0 +/// trace that was dropped by the tracer), `reason:overfull_buffer` (the local +/// buffer was full, and the trace chunk had to be dropped), +/// `reason:serialization_error` (there was an error serializing the trace and +/// it had to be dropped). +extern const telemetry::Counter trace_chunks_dropped; + +/// The number of trace chunks attempted to be sent to the backend, regardless +/// of response. +extern const telemetry::Counter trace_chunks_sent; + /// The number of trace segments (local traces) created, tagged with /// new/continued depending on whether this is a new trace (no distributed /// context information) or continued (has distributed context). @@ -25,11 +66,6 @@ extern const telemetry::Counter trace_segments_created; /// scenarios, trace_segments_closed == trace_chunks_enqueued. extern const telemetry::Counter trace_segments_closed; -/// The number of times a context propagation header is truncated, tagged by the -/// reason for truncation (`truncation_reason:baggage_item_count_exceeded`, -/// `truncation_reason:baggage_byte_count_exceeded`). -extern const telemetry::Counter context_header_truncated; - namespace api { /// The number of requests sent to the trace endpoint in the agent, regardless @@ -42,12 +78,39 @@ extern const telemetry::Counter requests; /// responses. extern const telemetry::Counter responses; +/// The size of the payload sent to the endpoint in bytes. +extern const telemetry::Distribution bytes_sent; + +/// The time it takes to flush the trace payload to the agent.
Note that this is +/// not the per trace time, this is the per payload time. +extern const telemetry::Distribution request_duration; + /// The number of requests sent to the trace endpoint in the agent that errored, /// tagged by the error type (e.g. `type:timeout`, `type:network`, /// `type:status_code`). extern const telemetry::Counter errors; } // namespace api + +namespace trace_context { + +/// The number of times distributed context is injected into an outgoing span, +/// tagged by header style (`header_style:tracecontext`, `header_style:datadog`, +/// `header_style:b3multi`, `header_style:b3single`, `header_style:baggage`) +extern const telemetry::Counter injected; + +/// The number of times distributed context is successfully extracted from an +/// incoming span, tagged by header style (`header_style:tracecontext`, +/// `header_style:datadog`, `header_style:b3multi`, `header_style:b3single`, +/// `header_style:baggage`) +extern const telemetry::Counter extracted; + +/// The number of times a context propagation header is truncated, tagged by the +/// reason for truncation (`truncation_reason:baggage_item_count_exceeded`, +/// `truncation_reason:baggage_byte_count_exceeded`) +extern const telemetry::Counter truncated; +} // namespace trace_context + } // namespace tracer } // namespace datadog::tracing::metrics diff --git a/src/datadog/trace_segment.cpp b/src/datadog/trace_segment.cpp index 44d0d1ee..b7f74c0f 100644 --- a/src/datadog/trace_segment.cpp +++ b/src/datadog/trace_segment.cpp @@ -140,7 +140,8 @@ Optional TraceSegment::sampling_decision() const { Logger& TraceSegment::logger() const { return *logger_; } void TraceSegment::register_span(std::unique_ptr span) { - telemetry::counter::increment(metrics::tracer::spans_created); + telemetry::counter::increment(metrics::tracer::spans_created, + {"integration_name:datadog"}); std::lock_guard lock(mutex_); assert(spans_.empty() || num_finished_spans_ < spans_.size()); @@ -149,7 +150,8 @@ void TraceSegment::register_span(std::unique_ptr span) { void TraceSegment::span_finished() { { - telemetry::counter::increment(metrics::tracer::spans_finished); + telemetry::counter::increment(metrics::tracer::spans_finished, + {"integration_name:datadog"}); std::lock_guard lock(mutex_); ++num_finished_spans_; assert(num_finished_spans_ <= spans_.size()); @@ -157,15 +159,20 @@ void TraceSegment::span_finished() { return; } } + + telemetry::counter::increment(metrics::tracer::trace_chunks_enqueued); + // We don't need the lock anymore. There's nobody left to call our methods. // On the other hand, there's nobody left to contend for the mutex, so it // doesn't make any difference. make_sampling_decision_if_null(); assert(sampling_decision_); - // All of our spans are finished. Run the span sampler, finalize the spans, + // All of our spans are finished.  Run the span sampler, finalize the spans, // and then send the spans to the collector. if (sampling_decision_->priority <= 0) { + telemetry::counter::increment(metrics::tracer::trace_chunks_dropped, + {"reason:p0_drop"}); // Span sampling happens when the trace is dropped.
for (const auto& span_ptr : spans_) { SpanData& span = *span_ptr; @@ -175,8 +182,11 @@ void TraceSegment::span_finished() { } const SamplingDecision decision = rule->decide(span); if (decision.priority <= 0) { + telemetry::counter::increment(metrics::tracer::spans_dropped, + {"reason:p0_drop"}); continue; } + span.numeric_tags[tags::internal::span_sampling_mechanism] = *decision.mechanism; span.numeric_tags[tags::internal::span_sampling_rule_rate] = @@ -233,6 +243,10 @@ void TraceSegment::span_finished() { } if (config_manager_->report_traces()) { + telemetry::distribution::add(metrics::tracer::trace_chunk_size, + spans_.size()); + + telemetry::counter::increment(metrics::tracer::trace_chunks_sent); const auto result = collector_->send(std::move(spans_), trace_sampler_); if (auto* error = result.if_error()) { logger_->log_error( @@ -336,6 +350,9 @@ bool TraceSegment::inject(DictWriter& writer, const SpanData& span, } inject_trace_tags(writer, trace_tags, tags_header_max_size_, spans_.front()->tags, *logger_); + + telemetry::counter::increment(metrics::tracer::trace_context::injected, + {"header_style:datadog"}); break; case PropagationStyle::B3: if (span.trace_id.high) { @@ -350,6 +367,8 @@ bool TraceSegment::inject(DictWriter& writer, const SpanData& span, } inject_trace_tags(writer, trace_tags, tags_header_max_size_, spans_.front()->tags, *logger_); + telemetry::counter::increment(metrics::tracer::trace_context::injected, + {"header_style:b3multi"}); break; case PropagationStyle::W3C: writer.set( @@ -360,6 +379,8 @@ bool TraceSegment::inject(DictWriter& writer, const SpanData& span, encode_tracestate(span.span_id, sampling_priority, origin_, trace_tags, additional_datadog_w3c_tracestate_, additional_w3c_tracestate_)); + telemetry::counter::increment(metrics::tracer::trace_context::injected, + {"header_style:tracecontext"}); break; default: break; diff --git a/src/datadog/tracer.cpp b/src/datadog/tracer.cpp index 1e2e9e17..e0f34a1e 100644 --- a/src/datadog/tracer.cpp +++ b/src/datadog/tracer.cpp @@ -213,18 +213,23 @@ Expected Tracer::extract_span(const DictReader& reader, for (const auto style : extraction_styles_) { using Extractor = decltype(&extract_datadog); // function pointer Extractor extract; + std::string extracted_tag; ///< for telemetry switch (style) { case PropagationStyle::DATADOG: extract = &extract_datadog; + extracted_tag = "header_style:datadog"; break; case PropagationStyle::B3: extract = &extract_b3; + extracted_tag = "header_style:b3multi"; break; case PropagationStyle::W3C: extract = &extract_w3c; + extracted_tag = "header_style:tracecontext"; break; default: extract = &extract_none; + extracted_tag = "header_style:none"; } audited_reader.entries_found.clear(); auto data = extract(audited_reader, span_data->tags, *logger_); @@ -233,6 +238,9 @@ Expected Tracer::extract_span(const DictReader& reader, extraction_error_prefix(style, audited_reader.entries_found)); } + telemetry::counter::increment(metrics::tracer::trace_context::extracted, + {extracted_tag}); + if (!first_style_with_trace_id && data->trace_id.has_value()) { first_style_with_trace_id = style; } @@ -419,7 +427,12 @@ Expected Tracer::extract_baggage( return Baggage::Error{Baggage::Error::DISABLED}; } - return Baggage::extract(reader); + auto maybe_baggage = Baggage::extract(reader); + if (maybe_baggage) { + telemetry::counter::increment(metrics::tracer::trace_context::extracted, + {"header_style:baggage"}); + } + return maybe_baggage; } Baggage Tracer::extract_or_create_baggage(const DictReader& reader) { 
@@ -444,12 +457,15 @@ Expected Tracer::inject(const Baggage& baggage, DictWriter& writer) { if (err->code == Error::Code::BAGGAGE_MAXIMUM_BYTES_REACHED) { telemetry::counter::increment( - metrics::tracer::context_header_truncated, + metrics::tracer::trace_context::truncated, {"truncation_reason:baggage_byte_count_exceeded"}); } else if (err->code == Error::Code::BAGGAGE_MAXIMUM_ITEMS_REACHED) { telemetry::counter::increment( - metrics::tracer::context_header_truncated, + metrics::tracer::trace_context::truncated, {"truncation_reason:baggage_item_count_exceeded"}); + } else { + telemetry::counter::increment(metrics::tracer::trace_context::injected, + {"header_style:baggage"}); } } diff --git a/test/telemetry/test_telemetry.cpp b/test/telemetry/test_telemetry.cpp index b34688aa..383be17f 100644 --- a/test/telemetry/test_telemetry.cpp +++ b/test/telemetry/test_telemetry.cpp @@ -33,6 +33,15 @@ bool is_valid_telemetry_payload(const nlohmann::json& json) { json.contains("/host"_json_pointer); } +std::optional find_payload(const nlohmann::json& messages, + std::string_view kind) { + for (const auto& m : messages) { + if (m["request_type"].get() == kind) return m; + } + + return std::nullopt; +}; + struct FakeEventScheduler : public EventScheduler { size_t count_tasks = 0; std::function heartbeat_callback = nullptr; @@ -287,9 +296,11 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry lifecycle") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch) == true); - REQUIRE(message_batch["payload"].size() == 1); - auto heartbeat = message_batch["payload"][0]; - REQUIRE(heartbeat["request_type"] == "app-closing"); + REQUIRE(message_batch["payload"].size() >= 1); + + auto app_closing_payload = + find_payload(message_batch["payload"], "app-closing"); + REQUIRE(app_closing_payload.has_value()); } } @@ -321,9 +332,9 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto heartbeat_message = client->request_body; auto message_batch = nlohmann::json::parse(heartbeat_message); REQUIRE(is_valid_telemetry_payload(message_batch) == true); - REQUIRE(message_batch["payload"].size() == 1); - auto heartbeat = message_batch["payload"][0]; - REQUIRE(heartbeat["request_type"] == "app-heartbeat"); + REQUIRE(message_batch["payload"].size() >= 1); + + REQUIRE(find_payload(message_batch["payload"], "app-heartbeat")); } SECTION("metrics reporting") { @@ -355,13 +366,15 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch) == true); - REQUIRE(message_batch["payload"].size() == 2); - auto generate_metrics = message_batch["payload"][1]; - REQUIRE(generate_metrics["request_type"] == "generate-metrics"); - auto payload = generate_metrics["payload"]; + REQUIRE(message_batch["payload"].size() >= 2); + + auto generate_metrics = + find_payload(message_batch["payload"], "generate-metrics"); + REQUIRE(generate_metrics.has_value()); + auto payload = (*generate_metrics)["payload"]; auto series = payload["series"]; - REQUIRE(series.size() == 2); + REQUIRE(series.size() >= 2); const auto expected_metrics = nlohmann::json::parse(R"( [ @@ -390,10 +403,12 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { )"); for (const auto& s : series) { - if (s.contains("tags")) { - CHECK(s == expected_metrics[0]); - } else { - CHECK(s == expected_metrics[1]); + if (s["metric"] == "my_counter") { + if (s.contains("tags")) { + CHECK(s == expected_metrics[0]); 
+ } else { + CHECK(s == expected_metrics[1]); + } } } @@ -404,10 +419,9 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch2 = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch2) == true); - REQUIRE(message_batch2["payload"].size() == 1); + REQUIRE(message_batch2["payload"].size() >= 1); - auto payload2 = message_batch["payload"][0]; - CHECK(payload2["request_type"] == "app-heartbeat"); + CHECK(find_payload(message_batch["payload"], "app-heartbeat")); } SECTION("counters can't go below zero") { @@ -422,17 +436,17 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch) == true); - REQUIRE(message_batch["payload"].size() == 2); + REQUIRE(message_batch["payload"].size() >= 2); - auto generate_metrics = message_batch["payload"][1]; - REQUIRE(generate_metrics["request_type"] == "generate-metrics"); - auto payload = generate_metrics["payload"]; + auto generate_metrics = + find_payload(message_batch["payload"], "generate-metrics"); + REQUIRE(generate_metrics); + auto payload = (*generate_metrics)["payload"]; auto series = payload["series"]; - REQUIRE(series.size() == 1); + REQUIRE(series.size() >= 1); - const auto expected_metrics = nlohmann::json::parse(R"( - [ + const auto expected_metric = nlohmann::json::parse(R"( { "common": true, "metric": "positive_counter", @@ -442,9 +456,13 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { ], "type": "count" } - ] )"); - CHECK(series == expected_metrics); + + for (const auto& s : series) { + if (s["metric"] == "positive_counter") { + CHECK(s == expected_metric); + } + } } SECTION("rate") { @@ -468,13 +486,14 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch) == true); - REQUIRE(message_batch["payload"].size() == 2); - auto generate_metrics = message_batch["payload"][1]; - REQUIRE(generate_metrics["request_type"] == "generate-metrics"); - auto payload = generate_metrics["payload"]; + REQUIRE(message_batch["payload"].size() >= 2); + auto generate_metrics = + find_payload(message_batch["payload"], "generate-metrics"); + REQUIRE(generate_metrics); + auto payload = (*generate_metrics)["payload"]; auto series = payload["series"]; - REQUIRE(series.size() == 2); + REQUIRE(series.size() >= 2); const auto expected_metrics = nlohmann::json::parse(R"( [ @@ -502,10 +521,12 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { )"); for (const auto& s : series) { - if (s.contains("tags")) { - CHECK(s == expected_metrics[0]); - } else { - CHECK(s == expected_metrics[1]); + if (s["metric"] == "request") { + if (s.contains("tags")) { + CHECK(s == expected_metrics[0]); + } else { + CHECK(s == expected_metrics[1]); + } } } @@ -516,10 +537,8 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch2 = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch2) == true); - REQUIRE(message_batch2["payload"].size() == 1); - - auto payload2 = message_batch["payload"][0]; - CHECK(payload2["request_type"] == "app-heartbeat"); + REQUIRE(message_batch2["payload"].size() >= 1); + CHECK(find_payload(message_batch["payload"], "app-heartbeat")); } SECTION("distribution") { @@ -552,7 +571,7 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { REQUIRE(distribution_message["request_type"] == 
"distributions"); auto distribution_series = distribution_message["payload"]["series"]; - REQUIRE(distribution_series.size() == 3); + REQUIRE(distribution_series.size() >= 3); const auto expected_series = nlohmann::json::parse(R"([ { @@ -585,8 +604,6 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { } } else if (s["metric"] == "request_size") { CHECK(s == expected_series[1]); - } else { - FAIL(); } } @@ -597,10 +614,8 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch2 = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch2) == true); - REQUIRE(message_batch2["payload"].size() == 1); - - auto payload = message_batch["payload"][0]; - CHECK(payload["request_type"] == "app-heartbeat"); + REQUIRE(message_batch2["payload"].size() >= 1); + CHECK(find_payload(message_batch["payload"], "app-heartbeat")); } SECTION("dtor sends metrics and distributions") { @@ -626,7 +641,7 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { const auto& req_type = payload["request_type"]; if (req_type == "generate-metrics") { const auto& metrics_series = payload["payload"]["series"]; - REQUIRE(metrics_series.size() == 2); + REQUIRE(metrics_series.size() >= 2); for (const auto& s : metrics_series) { if (s["metric"] == "my_counter") { @@ -651,15 +666,12 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { } )"); CHECK(s == expected_rate); - } else { - FAIL("unexpected metrics name, got " << s["metric"]); } } } else if (req_type == "distributions") { const auto& distribution_series = payload["payload"]["series"]; - REQUIRE(distribution_series.size() == 1); + REQUIRE(distribution_series.size() >= 1); - const auto& d0 = distribution_series[0]; const auto expected_d0 = nlohmann::json::parse(R"( { "common":false, @@ -668,7 +680,12 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { "points": [128] } )"); - CHECK(d0 == expected_d0); + + for (const auto& d : distribution_series) { + if (d["metric"] == "response_time") { + CHECK(d == expected_d0); + }; + } } } } @@ -725,12 +742,12 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch)); - REQUIRE(message_batch["payload"].size() == 2); + REQUIRE(message_batch["payload"].size() >= 2); - auto logs_message = message_batch["payload"][1]; - REQUIRE(logs_message["request_type"] == "logs"); + auto logs_message = find_payload(message_batch["payload"], "logs"); + REQUIRE(logs_message); - auto logs_payload = logs_message["payload"]["logs"]; + auto logs_payload = (*logs_message)["payload"]["logs"]; REQUIRE(logs_payload.size() == 1); CHECK(logs_payload[0]["level"] == test_case.expected_log_level); CHECK(logs_payload[0]["message"] == test_case.input); @@ -749,10 +766,8 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch2 = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch2) == true); - REQUIRE(message_batch2["payload"].size() == 1); - - auto payload2 = message_batch["payload"][0]; - CHECK(payload2["request_type"] == "app-heartbeat"); + REQUIRE(message_batch2["payload"].size() >= 1); + CHECK(find_payload(message_batch["payload"], "app-heartbeat")); } SECTION("dtor sends logs in `app-closing` message") { @@ -765,12 +780,12 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry API") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch)); - 
REQUIRE(message_batch["payload"].size() == 2); + REQUIRE(message_batch["payload"].size() >= 2); - auto logs_message = message_batch["payload"][1]; - REQUIRE(logs_message["request_type"] == "logs"); + auto logs_message = find_payload(message_batch["payload"], "logs"); + REQUIRE(logs_message); - auto logs_payload = logs_message["payload"]["logs"]; + auto logs_payload = (*logs_message)["payload"]["logs"]; REQUIRE(logs_payload.size() == 1); CHECK(logs_payload[0]["level"] == "WARNING"); CHECK(logs_payload[0]["message"] == "Be careful!"); @@ -842,7 +857,7 @@ TELEMETRY_IMPLEMENTATION_TEST("Tracer telemetry configuration") { auto message_batch = nlohmann::json::parse(client->request_body); REQUIRE(is_valid_telemetry_payload(message_batch)); - REQUIRE(message_batch["payload"].size() == 1); - CHECK(message_batch["payload"][0]["request_type"] == "app-heartbeat"); + REQUIRE(message_batch["payload"].size() >= 1); + CHECK(find_payload(message_batch["payload"], "app-heartbeat")); } }
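
Side note on the "// Ideally:" comment in DatadogAgent::flush(): a small timing helper could replace the manual steady_clock bookkeeping around msgpack_encode. The sketch below is hypothetical and not part of this patch (the `measure` name, the pair-shaped return value, and the millisecond unit are assumptions taken from the commented-out pseudocode), but it shows one shape such a utility could take.

#include <chrono>
#include <utility>

// Hypothetical helper, not part of this patch: run a callable that returns a
// value and report how long it took in milliseconds. Call sites could then
// write something like
//   auto [encode_result, duration] = measure([&] { /* encode and return */ });
// and feed `duration.count()` into the serialization-duration distribution.
template <typename Callable>
auto measure(Callable&& callable) {
  const auto before = std::chrono::steady_clock::now();
  auto result = std::forward<Callable>(callable)();  // must return a value
  const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
      std::chrono::steady_clock::now() - before);
  return std::pair{std::move(result), elapsed};
}

With a helper along these lines, the explicit `beg`/`end` bookkeeping and the commented-out pseudocode in `DatadogAgent::flush()` could collapse into a single call.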