From 9e7fc16b92066438eba41be12c977e5e43c8b261 Mon Sep 17 00:00:00 2001
From: Vincent Boutour
Date: Thu, 27 Nov 2025 17:26:35 +0100
Subject: [PATCH 1/2] feat(aws): Adding scheduled retry of failed events

Signed-off-by: Vincent Boutour
---
 README.md    | 25 +++++++++++----------
 main.tf      | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 variables.tf | 12 ++++++++++++
 3 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 4cc0ec7..0f8aae2 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ This Terraform module creates the Datadog Log Lambda Forwarder infrastructure in
 - **Lambda Permissions**: For invocation by CloudWatch Logs, S3, SNS, and EventBridge
 - **Secrets Management**: Support for storing Datadog API key in Secrets Manager or SSM Parameter Store
 - **VPC Support**: Deploy forwarder in VPC with proxy
+- **Scheduler**: For scheduled retry of stored failed events
 
 ## Usage
 
@@ -120,17 +121,19 @@ For complete usage examples demonstrating different configuration scenarios, see
 
 ### Advanced Configuration
 
-| Name                              | Description                      | Type     | Default |
-| --------------------------------- | -------------------------------- | -------- | ------- |
-| dd_compression_level              | Compression level (0-9)          | `string` | `null`  |
-| dd_max_workers                    | Max concurrent workers           | `string` | `null`  |
-| dd_log_level                      | Log level                        | `string` | `null`  |
-| dd_store_failed_events            | Store failed events in S3        | `bool`   | `null`  |
-| dd_forwarder_bucket_name          | Custom S3 bucket name            | `string` | `null`  |
-| dd_forwarder_existing_bucket_name | Existing S3 bucket name          | `string` | `null`  |
-| dd_api_url                        | Custom API URL                   | `string` | `null`  |
-| dd_trace_intake_url               | Custom trace intake URL          | `string` | `null`  |
-| additional_target_lambda_arns     | Additional Lambda ARNs to invoke | `string` | `null`  |
+| Name                              | Description                                             | Type     | Default |
+| --------------------------------- | ------------------------------------------------------- | -------- | ------- |
+| dd_compression_level              | Compression level (0-9)                                 | `string` | `null`  |
+| dd_max_workers                    | Max concurrent workers                                  | `string` | `null`  |
+| dd_log_level                      | Log level                                               | `string` | `null`  |
+| dd_store_failed_events            | Store failed events in S3                               | `bool`   | `null`  |
+| dd_schedule_retry_failed_events   | Periodically retry failed events (via AWS EventBridge)  | `bool`   | `null`  |
+| dd_schedule_retry_interval        | Retry interval in hours for failed events               | `number` | `6`     |
+| dd_forwarder_bucket_name          | Custom S3 bucket name                                   | `string` | `null`  |
+| dd_forwarder_existing_bucket_name | Existing S3 bucket name                                 | `string` | `null`  |
+| dd_api_url                        | Custom API URL                                          | `string` | `null`  |
+| dd_trace_intake_url               | Custom trace intake URL                                 | `string` | `null`  |
+| additional_target_lambda_arns     | Additional Lambda ARNs to invoke                        | `string` | `null`  |
 
 ### IAM Configuration
 
diff --git a/main.tf b/main.tf
index 37bf505..21dfc0c 100644
--- a/main.tf
+++ b/main.tf
@@ -290,3 +290,64 @@ resource "aws_cloudwatch_log_group" "forwarder_log_group" {
 
   tags = var.tags
 }
+
+# Scheduled retry
+
+resource "aws_iam_role" "scheduled_retry" {
+  count = var.dd_store_failed_events == true && var.dd_schedule_retry_failed_events == true ? 1 : 0
+
+  name = "${var.function_name}-${local.region}-retry"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRole"
+        Effect = "Allow"
+        Principal = {
+          Service = data.aws_partition.current.partition == "aws-cn" ? "scheduler.amazonaws.com.cn" : "scheduler.amazonaws.com"
+        }
+      }
+    ]
+  })
+
+  permissions_boundary = var.permissions_boundary_arn != null ? var.permissions_boundary_arn : null
+
+  tags = var.tags
+}
+
+resource "aws_iam_role_policy" "scheduled_retry" {
+  count = var.dd_store_failed_events == true && var.dd_schedule_retry_failed_events == true ? 1 : 0
+
+  name = "${var.function_name}-${local.region}-retry-policy"
+  role = aws_iam_role.scheduled_retry[0].id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = [
+          "lambda:InvokeFunction",
+        ]
+        Effect   = "Allow"
+        Resource = aws_lambda_function.forwarder.arn
+      },
+    ]
+  })
+}
+
+resource "aws_scheduler_schedule" "scheduled_retry" {
+  count = var.dd_store_failed_events == true && var.dd_schedule_retry_failed_events == true ? 1 : 0
+
+  name                = "${var.function_name}-${local.region}-retry"
+  description         = "Retry the failed events from the Datadog Lambda Forwarder ${var.function_name}"
+  schedule_expression = "rate(${var.dd_schedule_retry_interval} hours)"
+
+  flexible_time_window {
+    mode = "OFF"
+  }
+
+  target {
+    arn      = aws_lambda_function.forwarder.arn
+    role_arn = aws_iam_role.scheduled_retry[0].arn
+    input    = jsonencode({ retry = true })
+  }
+}
diff --git a/variables.tf b/variables.tf
index 2625b7d..4af8e85 100644
--- a/variables.tf
+++ b/variables.tf
@@ -370,6 +370,18 @@ variable "dd_store_failed_events" {
   description = "Set to true to enable the forwarder to store events that failed to send to Datadog."
 }
 
+variable "dd_schedule_retry_failed_events" {
+  type        = bool
+  default     = null
+  description = "Set to true to enable a scheduled forwarder invocation (via AWS EventBridge) to process stored failed events. Requires dd_store_failed_events to be true."
+}
+
+variable "dd_schedule_retry_interval" {
+  type        = number
+  default     = 6
+  description = "Interval in hours between scheduled forwarder invocations (via AWS EventBridge)."
+}
+
 variable "dd_forwarder_existing_bucket_name" {
   type    = string
   default = null

From 02ab3fcf80774c16dde39322ab41cbafb3b9af92 Mon Sep 17 00:00:00 2001
From: Vincent Boutour
Date: Fri, 28 Nov 2025 10:46:28 +0100
Subject: [PATCH 2/2] docs(aws): Adding more details about the retry

Signed-off-by: Vincent Boutour
---
 README.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/README.md b/README.md
index 0f8aae2..e73d3c0 100644
--- a/README.md
+++ b/README.md
@@ -276,6 +276,32 @@ module "datadog_forwarder_us_west_2" {
 - Your IAM role must have appropriate permissions for resources in each target region
 - Secrets/parameters containing the Datadog API key should exist in each target region
 
+## Scheduled Retry
+
+When you enable `dd_store_failed_events`, the Lambda forwarder stores any events that couldn't be sent to Datadog in an S3 bucket. These events can be logs, metrics, or traces. They aren't automatically re-processed on each Lambda invocation; instead, you must trigger a [manual Lambda run](https://docs.datadoghq.com/logs/guide/forwarder/?tab=manual) to process them again.
+
+You can automate this re-processing by enabling the `dd_schedule_retry_failed_events` parameter, which creates a scheduled Lambda invocation through [AWS EventBridge Scheduler](https://docs.aws.amazon.com/lambda/latest/dg/with-eventbridge-scheduler.html). By default, the forwarder retries stored events every six hours; use `dd_schedule_retry_interval` to change the interval.
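+
+For example, a minimal configuration enabling the scheduled retry (the `source` and `function_name` values below are placeholders; reuse the ones from your existing forwarder setup):
+
+```hcl
+module "datadog_forwarder" {
+  source = "..." # path or registry address of this module
+
+  function_name = "datadog-forwarder"
+
+  # Both flags are required: the schedule and its IAM role are only
+  # created when failed events are also stored in S3.
+  dd_store_failed_events          = true
+  dd_schedule_retry_failed_events = true
+
+  # Optional: retry every 12 hours instead of the default 6.
+  dd_schedule_retry_interval = 12
+}
+```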
"scheduler.amazonaws.com.cn" : "scheduler.amazonaws.com" + } + } + ] + }) + + permissions_boundary = var.permissions_boundary_arn != null ? var.permissions_boundary_arn : null + + tags = var.tags +} + +resource "aws_iam_role_policy" "scheduled_retry" { + count = var.dd_store_failed_events && var.dd_schedule_retry_failed_events ? 1 : 0 + + name = "${var.function_name}-${local.region}-retry-policy" + role = aws_iam_role.scheduled_retry[0].id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = [ + "lambda:InvokeFunction", + ] + Effect = "Allow" + Resource = aws_lambda_function.forwarder.arn + }, + ] + }) +} + +resource "aws_scheduler_schedule" "scheduled_retry" { + count = var.dd_store_failed_events && var.dd_schedule_retry_failed_events ? 1 : 0 + + name = "${var.function_name}-${local.region}-retry" + description = "Retry the failed events from the Datadog Lambda Forwarder ${var.function_name}" + schedule_expression = "rate(${var.dd_schedule_retry_interval} hours)" + flexible_time_window { + mode = "OFF" + } + target { + arn = aws_lambda_function.forwarder.arn + role_arn = aws_iam_role.scheduled_retry[0].arn + input = jsonencode({ retry = true }) + } +} diff --git a/variables.tf b/variables.tf index 2625b7d..4af8e85 100644 --- a/variables.tf +++ b/variables.tf @@ -370,6 +370,18 @@ variable "dd_store_failed_events" { description = "Set to true to enable the forwarder to store events that failed to send to Datadog." } +variable "dd_schedule_retry_failed_events" { + type = bool + default = null + description = "Set to true to enable a scheduled forwarder invocation (via AWS EventBridge) to process stored failed events." +} + +variable "dd_schedule_retry_interval" { + type = number + default = 6 + description = "Interval in hours for scheduled forwarder invocation (via AWS EventBridge)." +} + variable "dd_forwarder_existing_bucket_name" { type = string default = null From 02ab3fcf80774c16dde39322ab41cbafb3b9af92 Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Fri, 28 Nov 2025 10:46:28 +0100 Subject: [PATCH 2/2] docs(aws): Adding more details about the retry Signed-off-by: Vincent Boutour --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 0f8aae2..e73d3c0 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,14 @@ module "datadog_forwarder_us_west_2" { - Your IAM role must have appropriate permissions for resources in each target region - Secrets/parameters containing the Datadog API key should exist in each target region +## Scheduled retry + +When you enable `dd_store_failed_events`, the Lambda forwarder stores any events that couldn’t be sent to Datadog in an S3 bucket. These events can be logs, metrics, or traces. They aren’t automatically re‑processed on each Lambda invocation; instead, you must trigger a [manual Lambda run](https://docs.datadoghq.com/logs/guide/forwarder/?tab=manual) to process them again. + +You can automate this re‑processing by enabling `dd_schedule_retry_failed_events` parameter, creating a scheduled Lambda invocation through [AWS EventBridge](https://docs.aws.amazon.com/lambda/latest/dg/with-eventbridge-scheduler.html). By default, the forwarder attempts re‑processing every six hours. + +Keep in mind that log events can only be submitted with [timestamps up to 18 hours in the past](https://docs.datadoghq.com/logs/log_collection/?tab=host#custom-log-forwarding); older timestamps will cause the events to be discarded. + ## Troubleshooting ### Common Issues