diff --git a/.github/workflows/aws_unit_test.yml b/.github/workflows/aws_unit_test.yml
index 2f9556a7..d1e5754e 100644
--- a/.github/workflows/aws_unit_test.yml
+++ b/.github/workflows/aws_unit_test.yml
@@ -18,6 +18,7 @@ jobs:
           AWS_DEFAULT_REGION: us-east-1
           DD_API_KEY: "11111111111111111111111111111111"
           DD_ADDITIONAL_TARGET_LAMBDAS: "ironmaiden,megadeth"
+          DD_STORE_FAILED_EVENTS: "true"
         run: |
           pip install boto3 mock approvaltests
           python -m unittest discover ./aws/logs_monitoring/
diff --git a/aws/logs_monitoring/README.md b/aws/logs_monitoring/README.md
index f1667afc..493385d7 100644
--- a/aws/logs_monitoring/README.md
+++ b/aws/logs_monitoring/README.md
@@ -95,7 +95,7 @@ If you can't install the Forwarder using the provided CloudFormation template, y
 8. Set the environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. When an exception occurs while sending logs, metrics, or traces to the intake, the forwarder stores the relevant data in the S3 bucket. On custom invocations, i.e. on receiving an event with the `retry` keyword set to a non-empty string (which can be triggered manually - see below), the forwarder retries sending the stored events and, when successful, clears them from the bucket.
 
 ```bash
-aws lambda invoke --function-name --payload '{"retry":"true"}' out
+aws lambda invoke --function-name --payload '{"retry":"true"}' --cli-binary-format raw-in-base64-out --log-type Tail /dev/stdout
 ```
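For reference, the manual retry shown in the README's `aws lambda invoke` example can also be triggered from Python via boto3. This is a sketch, not part of the PR: the function name below is a placeholder, and it assumes the `retry` payload key matches `DD_RETRY_KEYWORD` as in the CLI example above.

```python
import json

import boto3

# Placeholder name; substitute your deployed forwarder function.
FORWARDER_NAME = "datadog-forwarder"

client = boto3.client("lambda")

# Any non-empty string under the "retry" key triggers replay of the
# failed events stored in the S3 bucket, mirroring the CLI example.
response = client.invoke(
    FunctionName=FORWARDER_NAME,
    Payload=json.dumps({"retry": "true"}).encode("utf-8"),
)
print(response["StatusCode"])
```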
diff --git a/aws/logs_monitoring/forwarder.py b/aws/logs_monitoring/forwarder.py
index 10c8033c..0188b2af 100644
--- a/aws/logs_monitoring/forwarder.py
+++ b/aws/logs_monitoring/forwarder.py
@@ -128,6 +128,9 @@ def _forward_logs(self, logs, key=None):
         if DD_STORE_FAILED_EVENTS and len(failed_logs) > 0 and not key:
             self.storage.store_data(RetryPrefix.LOGS, failed_logs)
 
+        if len(failed_logs) > 0:
+            send_event_metric("logs_failed", len(failed_logs))
+
         send_event_metric("logs_forwarded", len(logs_to_forward) - len(failed_logs))
 
     def _forward_metrics(self, metrics, key=None):
@@ -156,6 +159,9 @@ def _forward_metrics(self, metrics, key=None):
         if DD_STORE_FAILED_EVENTS and len(failed_metrics) > 0 and not key:
             self.storage.store_data(RetryPrefix.METRICS, failed_metrics)
 
+        if len(failed_metrics) > 0:
+            send_event_metric("metrics_failed", len(failed_metrics))
+
         send_event_metric("metrics_forwarded", len(metrics) - len(failed_metrics))
 
     def _forward_traces(self, traces, key=None):
diff --git a/aws/logs_monitoring/lambda_function.py b/aws/logs_monitoring/lambda_function.py
index 9ec2eb45..154ff6c8 100644
--- a/aws/logs_monitoring/lambda_function.py
+++ b/aws/logs_monitoring/lambda_function.py
@@ -9,7 +9,6 @@
 from hashlib import sha1
 
 import boto3
-import requests
 from datadog import api
 from datadog_lambda.wrapper import datadog_lambda_wrapper
 
@@ -22,8 +21,9 @@
     DD_API_URL,
     DD_FORWARDER_VERSION,
     DD_RETRY_KEYWORD,
-    DD_SITE,
     DD_SKIP_SSL_VALIDATION,
+    DD_STORE_FAILED_EVENTS,
+    is_api_key_valid,
 )
 from steps.enrichment import enrich
 from steps.parsing import parse
@@ -33,42 +33,12 @@
 logger = logging.getLogger()
 logger.setLevel(logging.getLevelName(os.environ.get("DD_LOG_LEVEL", "INFO").upper()))
 
-# DD_API_KEY must be set
-if DD_API_KEY == "<YOUR_DATADOG_API_KEY>" or DD_API_KEY == "":
-    raise Exception(
-        "Missing Datadog API key. Set DD_API_KEY environment variable. "
-        "See: https://docs.datadoghq.com/serverless/forwarder/"
-    )
-# Check if the API key is the correct number of characters
-if len(DD_API_KEY) != 32:
-    raise Exception(
-        f"""
-        Invalid Datadog API key format. Expected 32 characters, received {len(DD_API_KEY)}.
-        Verify your API key at https://app.{DD_SITE}/organization-settings/api-keys
-        """
-    )
-# Validate the API key
-logger.debug("Validating the Datadog API key")
-with requests.Session() as s:
-    retries = requests.adapters.Retry(
-        total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
+if not is_api_key_valid() and not DD_STORE_FAILED_EVENTS:
+    raise Exception(
+        "Datadog API key validation failed and DD_STORE_FAILED_EVENTS is disabled, aborting."
     )
-    s.mount("http://", requests.adapters.HTTPAdapter(max_retries=retries))
-    s.mount("https://", requests.adapters.HTTPAdapter(max_retries=retries))
-
-    validation_res = s.get(
-        "{}/api/v1/validate?api_key={}".format(DD_API_URL, DD_API_KEY),
-        verify=(not DD_SKIP_SSL_VALIDATION),
-        timeout=10,
-    )
-    if not validation_res.ok:
-        raise Exception(
-            f"Datadog API key validation failed (HTTP {validation_res.status_code}). "
-            f"Verify your API key is correct and DD_SITE matches your Datadog account region (current: {DD_SITE}). "
-            "See: https://docs.datadoghq.com/getting_started/site/"
-        )
 
 # Force the layer to use the exact same API key and host as the forwarder
 api._api_key = DD_API_KEY
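The `self.storage.store_data(...)` calls in forwarder.py rely on a storage class that is not shown in this diff. As a rough mental model only (the bucket name, key layout, and both helper names below are assumptions, not the forwarder's actual implementation), the store-and-retry round-trip against S3 could look like this:

```python
import json
import uuid

import boto3

# Assumed bucket and key layout; the real storage class is not in this diff.
BUCKET = "dd-s3-bucket"
s3 = boto3.client("s3")


def store_data(prefix, events):
    # Persist a failed batch under a per-type prefix, e.g. "retry/logs/<uuid>".
    key = f"retry/{prefix}/{uuid.uuid4()}"
    s3.put_object(Bucket=BUCKET, Key=key, Body=json.dumps(events).encode("utf-8"))


def retry_stored(prefix, send):
    # Replay each stored batch and delete it only after a successful send,
    # which is how the bucket gets cleared on a successful retry.
    listing = s3.list_objects_v2(Bucket=BUCKET, Prefix=f"retry/{prefix}/")
    for obj in listing.get("Contents", []):
        body = s3.get_object(Bucket=BUCKET, Key=obj["Key"])["Body"].read()
        send(json.loads(body))
        s3.delete_object(Bucket=BUCKET, Key=obj["Key"])
```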
" - "See: https://docs.datadoghq.com/getting_started/site/" - ) # Force the layer to use the exact same API key and host as the forwarder api._api_key = DD_API_KEY diff --git a/aws/logs_monitoring/logs/datadog_http_client.py b/aws/logs_monitoring/logs/datadog_http_client.py index 0d1c8a27..b15c7421 100644 --- a/aws/logs_monitoring/logs/datadog_http_client.py +++ b/aws/logs_monitoring/logs/datadog_http_client.py @@ -68,6 +68,7 @@ def __init__( self._session = None self._ssl_validation = not skip_ssl_validation self._futures = [] + if logger.isEnabledFor(logging.DEBUG): logger.debug( f"Initialized http client for logs intake: " diff --git a/aws/logs_monitoring/settings.py b/aws/logs_monitoring/settings.py index 7d63efa6..4d9aa072 100644 --- a/aws/logs_monitoring/settings.py +++ b/aws/logs_monitoring/settings.py @@ -10,6 +10,7 @@ import boto3 import botocore.config +import requests logger = logging.getLogger() logger.setLevel(logging.getLevelName(os.environ.get("DD_LOG_LEVEL", "INFO").upper())) @@ -230,6 +231,49 @@ def __init__(self, name, pattern, placeholder, enabled=True): DD_API_KEY = DD_API_KEY.strip() os.environ["DD_API_KEY"] = DD_API_KEY + +def is_api_key_valid(): + # DD_API_KEY must be set + if DD_API_KEY == "" or DD_API_KEY == "": + raise Exception( + "Missing Datadog API key. Set DD_API_KEY environment variable. " + "See: https://docs.datadoghq.com/serverless/forwarder/" + ) + + # Check if the API key is the correct number of characters + if len(DD_API_KEY) != 32: + raise Exception( + f""" + Invalid Datadog API key format. Expected 32 characters, received {len(DD_API_KEY)}. + Verify your API key at https://app.{DD_SITE}/organization-settings/api-keys + """ + ) + + # Validate the API key + logger.debug("Validating the Datadog API key") + + with requests.Session() as s: + retries = requests.adapters.Retry( + total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504] + ) + + s.mount("http://", requests.adapters.HTTPAdapter(max_retries=retries)) + s.mount("https://", requests.adapters.HTTPAdapter(max_retries=retries)) + + validation_res = s.get( + "{}/api/v1/validate?api_key={}".format(DD_API_URL, DD_API_KEY), + verify=(not DD_SKIP_SSL_VALIDATION), + timeout=10, + ) + if not validation_res.ok: + logger.error( + f"Datadog API key validation failed (HTTP {validation_res.status_code}). Verify your API key is correct and DD_SITE matches your Datadog account region (current: {DD_SITE}). See: https://docs.datadoghq.com/getting_started/site/" + ) + return False + + return True + + # DD_MULTILINE_LOG_REGEX_PATTERN: Multiline Log Regular Expression Pattern DD_MULTILINE_LOG_REGEX_PATTERN = get_env_var( "DD_MULTILINE_LOG_REGEX_PATTERN", default=None diff --git a/aws/logs_monitoring/tests/run_unit_tests.sh b/aws/logs_monitoring/tests/run_unit_tests.sh index 3572e511..87ad0aff 100755 --- a/aws/logs_monitoring/tests/run_unit_tests.sh +++ b/aws/logs_monitoring/tests/run_unit_tests.sh @@ -2,5 +2,6 @@ export DD_API_KEY=11111111111111111111111111111111 export DD_ADDITIONAL_TARGET_LAMBDAS=ironmaiden,megadeth +export DD_STORE_FAILED_EVENTS="true" export DD_S3_BUCKET_NAME=dd-s3-bucket python3 -m unittest discover . 
diff --git a/aws/logs_monitoring/tools/integration_tests/docker-compose.yml b/aws/logs_monitoring/tools/integration_tests/docker-compose.yml
index acf78f48..96754366 100644
--- a/aws/logs_monitoring/tools/integration_tests/docker-compose.yml
+++ b/aws/logs_monitoring/tools/integration_tests/docker-compose.yml
@@ -27,50 +27,54 @@ services:
       AWS_SECURITY_TOKEN: "${AWS_SECURITY_TOKEN}"
       AWS_SESSION_TOKEN: "${AWS_SESSION_TOKEN}"
       AWS_DEFAULT_REGION: us-east-1
-      DD_LOG_LEVEL: ${LOG_LEVEL:-info}
+      DD_ADDITIONAL_TARGET_LAMBDAS: "${EXTERNAL_LAMBDAS}"
       DD_API_KEY: abcdefghijklmnopqrstuvwxyz012345 # Must be 32 characters exactly
-      DD_URL: recorder # Used for logs intake
-      DD_PORT: 8080 # API port to use
-      DD_SITE: datadog.com
       DD_API_URL: http://recorder:8080
-      DD_LOGS_INTAKE_URL: recorder:8080
-      DD_TRACE_INTAKE_URL: http://recorder:8080
-      DD_NO_SSL: "true"
-      DD_SKIP_SSL_VALIDATION: "true"
-      DD_USE_COMPRESSION: "false"
-      DD_ADDITIONAL_TARGET_LAMBDAS: "${EXTERNAL_LAMBDAS}"
-      DD_S3_BUCKET_NAME: "${DD_S3_BUCKET_NAME}"
       DD_FETCH_LAMBDA_TAGS: "${DD_FETCH_LAMBDA_TAGS:-false}"
       DD_FETCH_LOG_GROUP_TAGS: "${DD_FETCH_LOG_GROUP_TAGS:-false}"
       DD_FETCH_STEP_FUNCTIONS_TAGS: "${DD_FETCH_STEP_FUNCTIONS_TAGS:-false}"
-      DD_STORE_FAILED_EVENTS: "false"
+      DD_LOG_LEVEL: ${LOG_LEVEL:-info}
+      DD_LOGS_INTAKE_URL: recorder:8080
+      DD_NO_SSL: "true"
+      DD_PORT: 8080 # API port to use
+      DD_S3_BUCKET_NAME: "${DD_S3_BUCKET_NAME}"
+      DD_SITE: datadog.com
+      DD_SKIP_SSL_VALIDATION: "true"
+      DD_STORE_FAILED_EVENTS: "${DD_STORE_FAILED_EVENTS:-true}"
       DD_TRACE_ENABLED: "true"
+      DD_TRACE_INTAKE_URL: http://recorder:8080
+      DD_URL: recorder # Used for logs intake
+      DD_USE_COMPRESSION: "false"
     expose:
       - 8080
     depends_on:
       recorder:
        condition: service_healthy
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8080/2015-03-31/functions/function/invocations"]
-      interval: 10s
-      timeout: 5s
-      retries: 3
-
+      test:
+        [
+          "CMD",
+          "curl",
+          "-f",
+          "http://localhost:8080/2015-03-31/functions/function/invocations",
+        ]
+      interval: 10s
+      timeout: 5s
   tester:
     image: ${PYTHON_BASE}
     command: /bin/sh -c 'pip install "deepdiff<6" && python -m unittest discover'
     volumes:
       - ./tester:/tester
-      - ${SNAPSHOTS_DIR_NAME}:/snapshots
+      - "${SNAPSHOTS_DIR_NAME}:/snapshots"
     working_dir: /tester
     environment:
-      RECORDER_URL: http://recorder:8080/recording
       FORWARDER_URL: http://forwarder:8080/2015-03-31/functions/function/invocations
-      UPDATE_SNAPSHOTS: ${UPDATE_SNAPSHOTS:-false}
-      SNAPSHOTS_DIR_NAME: ${SNAPSHOTS_DIR_NAME}
+      RECORDER_URL: http://recorder:8080/recording
+      SNAPSHOTS_DIR_NAME: "${SNAPSHOTS_DIR_NAME}"
+      UPDATE_SNAPSHOTS: "${UPDATE_SNAPSHOTS:-false}"
     depends_on:
-        forwarder:
-            condition: service_healthy
-        recorder:
-            condition: service_healthy
+      forwarder:
+        condition: service_healthy
+      recorder:
+        condition: service_healthy
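The compose file now defaults the flag with `${DD_STORE_FAILED_EVENTS:-true}`, so the container sees the string "true" unless the host overrides it. On the Python side the forwarder presumably coerces such strings to booleans roughly like this (a sketch; the actual helper in settings.py is not shown in this diff):

```python
import os


def env_flag(name, default=False):
    # Treat "true" (any casing) as True and anything else as False,
    # matching the quoted string flags used in docker-compose.yml.
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() == "true"


DD_STORE_FAILED_EVENTS = env_flag("DD_STORE_FAILED_EVENTS", default=False)
```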
diff --git a/aws/logs_monitoring/tools/integration_tests/integration_tests.sh b/aws/logs_monitoring/tools/integration_tests/integration_tests.sh
index 8848b559..8d9521ca 100755
--- a/aws/logs_monitoring/tools/integration_tests/integration_tests.sh
+++ b/aws/logs_monitoring/tools/integration_tests/integration_tests.sh
@@ -23,6 +23,7 @@ CACHE_TEST=false
 DD_FETCH_LAMBDA_TAGS="true"
 DD_FETCH_LOG_GROUP_TAGS="true"
 DD_FETCH_STEP_FUNCTIONS_TAGS="true"
+DD_STORE_FAILED_EVENTS="true"
 
 script_start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 echo "Starting script time: $script_start_time"
@@ -154,6 +155,7 @@ LOG_LEVEL=${LOG_LEVEL} \
     DD_FETCH_LAMBDA_TAGS=${DD_FETCH_LAMBDA_TAGS} \
     DD_FETCH_LOG_GROUP_TAGS=${DD_FETCH_LOG_GROUP_TAGS} \
     DD_FETCH_STEP_FUNCTIONS_TAGS=${DD_FETCH_STEP_FUNCTIONS_TAGS} \
+    DD_STORE_FAILED_EVENTS=${DD_STORE_FAILED_EVENTS} \
     docker compose up --build --abort-on-container-exit
 
 if [ $ADDITIONAL_LAMBDA == true ]; then
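The `VAR=value \` prefix lines above set the flags for that single `docker compose up` invocation only. If you were driving the suite from Python instead, the equivalent per-invocation override would look like this (purely illustrative, not part of the PR):

```python
import os
import subprocess

# Per-invocation environment overrides, equivalent to the
# `DD_STORE_FAILED_EVENTS=... docker compose up` prefix in the script.
env = dict(os.environ, DD_STORE_FAILED_EVENTS="true", LOG_LEVEL="info")
subprocess.run(
    ["docker", "compose", "up", "--build", "--abort-on-container-exit"],
    check=True,
    env=env,
)
```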