chore(llmobs): token metrics name changes (#9657)
Makes the following updates to the metric key names submitted to LLM
Observability by the openai, bedrock, and anthropic integrations:

`prompt_tokens` -> `input_tokens`
`completion_tokens` -> `output_tokens`
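
For illustration, a minimal sketch of the payload shape before and after the rename (the token counts are made-up values):

```python
# Token metrics keys emitted by the integrations before this change.
old_metrics = {"prompt_tokens": 32, "completion_tokens": 15, "total_tokens": 47}

# Keys emitted after this change; "total_tokens" is unchanged.
new_metrics = {"input_tokens": 32, "output_tokens": 15, "total_tokens": 47}
```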

The backend already accepts these updated key names, so a hard cutover is
safe.

A release note is not needed: the metric key names used by our
integrations (openai, langchain, bedrock) when submitting data to the LLM
Obs backend are an internal contract between the integrations and the backend.

When users set metric key names for manually created spans, our
documentation already instructs them to use the input/output terminology; a
sketch of that usage follows below.
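
A minimal sketch of the manually-annotated case, assuming the documented `LLMObs.llm()` and `LLMObs.annotate()` SDK calls (argument names and availability may vary by ddtrace version):

```python
from ddtrace.llmobs import LLMObs

# Assumes LLM Observability has already been enabled (e.g. via LLMObs.enable()).
# Start a manually created LLM span and attach token metrics using the
# documented input/output key names.
with LLMObs.llm(model_name="gpt-4", model_provider="openai", name="manual_llm_call") as span:
    LLMObs.annotate(
        span=span,
        input_data="Hello world",
        output_data="Hi there!",
        metrics={"input_tokens": 2, "output_tokens": 3, "total_tokens": 5},
    )
```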

- [x] Change(s) are motivated and described in the PR description
- [x] Testing strategy is described if automated tests are not included
in the PR
- [x] Risks are described (performance impact, potential for breakage,
maintainability)
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] [Library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
are followed or label `changelog/no-changelog` is set
- [x] Documentation is included (in-code, generated user docs, [public
corp docs](https://github.com/DataDog/documentation/))
- [x] Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))
- [x] If this PR changes the public interface, I've notified
`@DataDog/apm-tees`.

- [x] Title is accurate
- [x] All changes are related to the pull request's stated goal
- [x] Description motivates each change
- [x] Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- [x] Testing strategy adequately addresses listed risks
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] Release note makes sense to a user of the library
- [x] Author has acknowledged and discussed the performance implications
of this PR as reported in the benchmarks PR comment
- [x] Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <evan.li@datadoghq.com>
Co-authored-by: kyle <kyle@verhoog.ca>
3 people committed Jul 12, 2024
1 parent 1b4b2b9 commit 36ffabe
Showing 15 changed files with 107 additions and 85 deletions.
4 changes: 4 additions & 0 deletions ddtrace/llmobs/_constants.py
@@ -24,3 +24,7 @@
)

LANGCHAIN_APM_SPAN_NAME = "langchain.request"

INPUT_TOKENS_METRIC_KEY = "input_tokens"
OUTPUT_TOKENS_METRIC_KEY = "output_tokens"
TOTAL_TOKENS_METRIC_KEY = "total_tokens"
17 changes: 10 additions & 7 deletions ddtrace/llmobs/_integrations/anthropic.py
@@ -8,12 +8,15 @@
from ddtrace._trace.span import Span
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import INPUT_MESSAGES
from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import MODEL_NAME
from ddtrace.llmobs._constants import MODEL_PROVIDER
from ddtrace.llmobs._constants import OUTPUT_MESSAGES
from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import SPAN_KIND
from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY

from .base import BaseLLMIntegration

@@ -175,16 +178,16 @@ def record_usage(self, span: Span, usage: Dict[str, Any]) -> None:
@staticmethod
def _get_llmobs_metrics_tags(span):
usage = {}
prompt_tokens = span.get_metric("anthropic.response.usage.input_tokens")
completion_tokens = span.get_metric("anthropic.response.usage.output_tokens")
input_tokens = span.get_metric("anthropic.response.usage.input_tokens")
output_tokens = span.get_metric("anthropic.response.usage.output_tokens")
total_tokens = span.get_metric("anthropic.response.usage.total_tokens")

if prompt_tokens is not None:
usage["prompt_tokens"] = prompt_tokens
if completion_tokens is not None:
usage["completion_tokens"] = completion_tokens
if input_tokens is not None:
usage[INPUT_TOKENS_METRIC_KEY] = input_tokens
if output_tokens is not None:
usage[OUTPUT_TOKENS_METRIC_KEY] = output_tokens
if total_tokens is not None:
usage["total_tokens"] = total_tokens
usage[TOTAL_TOKENS_METRIC_KEY] = total_tokens
return usage


9 changes: 6 additions & 3 deletions ddtrace/llmobs/_integrations/bedrock.py
@@ -6,14 +6,17 @@
from ddtrace._trace.span import Span
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import INPUT_MESSAGES
from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import MODEL_NAME
from ddtrace.llmobs._constants import MODEL_PROVIDER
from ddtrace.llmobs._constants import OUTPUT_MESSAGES
from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import PARENT_ID_KEY
from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
from ddtrace.llmobs._constants import SPAN_KIND
from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
from ddtrace.llmobs._integrations import BaseLLMIntegration
from ddtrace.llmobs._utils import _get_llmobs_parent_id

@@ -61,9 +64,9 @@ def _llmobs_metrics(span: Span, formatted_response: Optional[Dict[str, Any]]) ->
if formatted_response and formatted_response.get("text"):
prompt_tokens = int(span.get_tag("bedrock.usage.prompt_tokens") or 0)
completion_tokens = int(span.get_tag("bedrock.usage.completion_tokens") or 0)
metrics["prompt_tokens"] = prompt_tokens
metrics["completion_tokens"] = completion_tokens
metrics["total_tokens"] = prompt_tokens + completion_tokens
metrics[INPUT_TOKENS_METRIC_KEY] = prompt_tokens
metrics[OUTPUT_TOKENS_METRIC_KEY] = completion_tokens
metrics[TOTAL_TOKENS_METRIC_KEY] = prompt_tokens + completion_tokens
return metrics

@staticmethod
15 changes: 9 additions & 6 deletions ddtrace/llmobs/_integrations/openai.py
@@ -10,12 +10,15 @@
from ddtrace.internal.constants import COMPONENT
from ddtrace.internal.utils.version import parse_version
from ddtrace.llmobs._constants import INPUT_MESSAGES
from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import MODEL_NAME
from ddtrace.llmobs._constants import MODEL_PROVIDER
from ddtrace.llmobs._constants import OUTPUT_MESSAGES
from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import SPAN_KIND
from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
from ddtrace.llmobs._integrations.base import BaseLLMIntegration
from ddtrace.pin import Pin

@@ -221,17 +224,17 @@ def _set_llmobs_metrics_tags(span: Span, resp: Any, streamed: bool = False) -> D
completion_tokens = span.get_metric("openai.response.usage.completion_tokens") or 0
metrics.update(
{
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
INPUT_TOKENS_METRIC_KEY: prompt_tokens,
OUTPUT_TOKENS_METRIC_KEY: completion_tokens,
TOTAL_TOKENS_METRIC_KEY: prompt_tokens + completion_tokens,
}
)
elif resp:
metrics.update(
{
"prompt_tokens": resp.usage.prompt_tokens,
"completion_tokens": resp.usage.completion_tokens,
"total_tokens": resp.usage.prompt_tokens + resp.usage.completion_tokens,
INPUT_TOKENS_METRIC_KEY: resp.usage.prompt_tokens,
OUTPUT_TOKENS_METRIC_KEY: resp.usage.completion_tokens,
TOTAL_TOKENS_METRIC_KEY: resp.usage.prompt_tokens + resp.usage.completion_tokens,
}
)
return metrics
24 changes: 12 additions & 12 deletions tests/contrib/anthropic/test_anthropic_llmobs.py
@@ -59,7 +59,7 @@ def test_completion(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
],
output_messages=[{"content": 'THE BEST-SELLING BOOK OF ALL TIME IS "DON', "role": "assistant"}],
metadata={"temperature": 0.8, "max_tokens": 15.0},
token_metrics={"prompt_tokens": 32, "completion_tokens": 15, "total_tokens": 47},
token_metrics={"input_tokens": 32, "output_tokens": 15, "total_tokens": 47},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -153,7 +153,7 @@ def test_stream(self, anthropic, ddtrace_global_config, mock_llmobs_writer, mock
{"content": 'The phrase "I think, therefore I am" (originally in Latin as', "role": "assistant"}
],
metadata={"temperature": 0.8, "max_tokens": 15.0},
token_metrics={"prompt_tokens": 27, "completion_tokens": 15, "total_tokens": 42},
token_metrics={"input_tokens": 27, "output_tokens": 15, "total_tokens": 42},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -210,7 +210,7 @@ def test_stream_helper(self, anthropic, ddtrace_global_config, mock_llmobs_write
}
],
metadata={"temperature": 0.8, "max_tokens": 15.0},
token_metrics={"prompt_tokens": 27, "completion_tokens": 15, "total_tokens": 42},
token_metrics={"input_tokens": 27, "output_tokens": 15, "total_tokens": 42},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -265,7 +265,7 @@ def test_image(self, anthropic, ddtrace_global_config, mock_llmobs_writer, mock_
}
],
metadata={"temperature": 0.8, "max_tokens": 15.0},
token_metrics={"prompt_tokens": 246, "completion_tokens": 15, "total_tokens": 261},
token_metrics={"input_tokens": 246, "output_tokens": 15, "total_tokens": 261},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -303,7 +303,7 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
{"content": WEATHER_OUTPUT_MESSAGE_2, "role": "assistant"},
],
metadata={"temperature": 1.0, "max_tokens": 200.0},
token_metrics={"prompt_tokens": 599, "completion_tokens": 152, "total_tokens": 751},
token_metrics={"input_tokens": 599, "output_tokens": 152, "total_tokens": 751},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -356,7 +356,7 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
}
],
metadata={"temperature": 1.0, "max_tokens": 500.0},
token_metrics={"prompt_tokens": 768, "completion_tokens": 29, "total_tokens": 797},
token_metrics={"input_tokens": 768, "output_tokens": 29, "total_tokens": 797},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -395,7 +395,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
{"content": WEATHER_OUTPUT_MESSAGE_2, "role": "assistant"},
],
metadata={"temperature": 1.0, "max_tokens": 200.0},
token_metrics={"prompt_tokens": 599, "completion_tokens": 152, "total_tokens": 751},
token_metrics={"input_tokens": 599, "output_tokens": 152, "total_tokens": 751},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -448,7 +448,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
}
],
metadata={"temperature": 1.0, "max_tokens": 500.0},
token_metrics={"prompt_tokens": 768, "completion_tokens": 29, "total_tokens": 797},
token_metrics={"input_tokens": 768, "output_tokens": 29, "total_tokens": 797},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -497,7 +497,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
{"content": message[1]["text"], "role": "assistant"},
],
metadata={"temperature": 1.0, "max_tokens": 200.0},
token_metrics={"prompt_tokens": 599, "completion_tokens": 135, "total_tokens": 734},
token_metrics={"input_tokens": 599, "output_tokens": 135, "total_tokens": 734},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -547,7 +547,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
}
],
metadata={"temperature": 1.0, "max_tokens": 500.0},
token_metrics={"prompt_tokens": 762, "completion_tokens": 33, "total_tokens": 795},
token_metrics={"input_tokens": 762, "output_tokens": 33, "total_tokens": 795},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -591,7 +591,7 @@ async def test_tools_async_stream_helper(
{"content": WEATHER_OUTPUT_MESSAGE_2, "role": "assistant"},
],
metadata={"temperature": 1.0, "max_tokens": 200.0},
token_metrics={"prompt_tokens": 599, "completion_tokens": 146, "total_tokens": 745},
token_metrics={"input_tokens": 599, "output_tokens": 146, "total_tokens": 745},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -643,7 +643,7 @@
{"content": "\n\nThe current weather in San Francisco, CA is 73°F.", "role": "assistant"}
],
metadata={"temperature": 1.0, "max_tokens": 500.0},
token_metrics={"prompt_tokens": 762, "completion_tokens": 18, "total_tokens": 780},
token_metrics={"input_tokens": 762, "output_tokens": 18, "total_tokens": 780},
tags={"ml_app": "<ml-app-name>"},
)
)
4 changes: 2 additions & 2 deletions tests/contrib/botocore/test_bedrock_llmobs.py
@@ -96,8 +96,8 @@ def expected_llmobs_span_event(span, n_output, message=False):
output_messages=[{"content": mock.ANY} for _ in range(n_output)],
metadata=expected_parameters,
token_metrics={
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"input_tokens": prompt_tokens,
"output_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
},
tags={"service": "aws.bedrock-runtime", "ml_app": "<ml-app-name>"},
26 changes: 13 additions & 13 deletions tests/contrib/openai/test_openai_llmobs.py
@@ -36,7 +36,7 @@ def test_completion(self, openai, ddtrace_global_config, mock_llmobs_writer, moc
input_messages=[{"content": "Hello world"}],
output_messages=[{"content": ", relax!” I said to my laptop"}, {"content": " (1"}],
metadata={"temperature": 0.8, "max_tokens": 10},
token_metrics={"prompt_tokens": 2, "completion_tokens": 12, "total_tokens": 14},
token_metrics={"input_tokens": 2, "output_tokens": 12, "total_tokens": 14},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -58,7 +58,7 @@ def test_completion_stream(self, openai, ddtrace_global_config, mock_llmobs_writ
input_messages=[{"content": "Hello world"}],
output_messages=[{"content": expected_completion}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 2, "completion_tokens": 16, "total_tokens": 18},
token_metrics={"input_tokens": 2, "output_tokens": 16, "total_tokens": 18},
tags={"ml_app": "<ml-app-name>"},
),
)
@@ -95,7 +95,7 @@ def test_chat_completion(self, openai, ddtrace_global_config, mock_llmobs_writer
input_messages=input_messages,
output_messages=[{"role": "assistant", "content": choice.message.content} for choice in resp.choices],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 57, "completion_tokens": 34, "total_tokens": 91},
token_metrics={"input_tokens": 57, "output_tokens": 34, "total_tokens": 91},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -132,7 +132,7 @@ async def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_
input_messages=input_messages,
output_messages=[{"content": expected_completion, "role": "assistant"}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 8, "completion_tokens": 12, "total_tokens": 20},
token_metrics={"input_tokens": 8, "output_tokens": 12, "total_tokens": 20},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -164,7 +164,7 @@ def test_chat_completion_function_call(self, openai, ddtrace_global_config, mock
input_messages=[{"content": chat_completion_input_description, "role": "user"}],
output_messages=[{"content": expected_output, "role": "assistant"}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 157, "completion_tokens": 57, "total_tokens": 214},
token_metrics={"input_tokens": 157, "output_tokens": 57, "total_tokens": 214},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -200,7 +200,7 @@ def test_chat_completion_function_call_stream(self, openai, ddtrace_global_confi
input_messages=[{"content": chat_completion_input_description, "role": "user"}],
output_messages=[{"content": expected_output, "role": "assistant"}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 63, "completion_tokens": 33, "total_tokens": 96},
token_metrics={"input_tokens": 63, "output_tokens": 33, "total_tokens": 96},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -227,7 +227,7 @@ def test_chat_completion_tool_call(self, openai, ddtrace_global_config, mock_llm
input_messages=[{"content": chat_completion_input_description, "role": "user"}],
output_messages=[{"content": expected_output, "role": "assistant"}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 157, "completion_tokens": 57, "total_tokens": 214},
token_metrics={"input_tokens": 157, "output_tokens": 57, "total_tokens": 214},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -337,7 +337,7 @@ def test_completion(self, openai, ddtrace_global_config, mock_llmobs_writer, moc
input_messages=[{"content": "Hello world"}],
output_messages=[{"content": ", relax!” I said to my laptop"}, {"content": " (1"}],
metadata={"temperature": 0.8, "max_tokens": 10},
token_metrics={"prompt_tokens": 2, "completion_tokens": 12, "total_tokens": 14},
token_metrics={"input_tokens": 2, "output_tokens": 12, "total_tokens": 14},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -364,7 +364,7 @@ def test_completion_stream(self, openai, ddtrace_global_config, mock_llmobs_writ
input_messages=[{"content": "Hello world"}],
output_messages=[{"content": expected_completion}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 2, "completion_tokens": 2, "total_tokens": 4},
token_metrics={"input_tokens": 2, "output_tokens": 2, "total_tokens": 4},
tags={"ml_app": "<ml-app-name>"},
),
)
@@ -400,7 +400,7 @@ def test_chat_completion(self, openai, ddtrace_global_config, mock_llmobs_writer
input_messages=input_messages,
output_messages=[{"role": "assistant", "content": choice.message.content} for choice in resp.choices],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 57, "completion_tokens": 34, "total_tokens": 91},
token_metrics={"input_tokens": 57, "output_tokens": 34, "total_tokens": 91},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -438,7 +438,7 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs
input_messages=input_messages,
output_messages=[{"content": expected_completion, "role": "assistant"}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 8, "completion_tokens": 8, "total_tokens": 16},
token_metrics={"input_tokens": 8, "output_tokens": 8, "total_tokens": 16},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -469,7 +469,7 @@ def test_chat_completion_function_call(self, openai, ddtrace_global_config, mock
input_messages=[{"content": chat_completion_input_description, "role": "user"}],
output_messages=[{"content": expected_output, "role": "assistant"}],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 157, "completion_tokens": 57, "total_tokens": 214},
token_metrics={"input_tokens": 157, "output_tokens": 57, "total_tokens": 214},
tags={"ml_app": "<ml-app-name>"},
)
)
@@ -503,7 +503,7 @@ def test_chat_completion_tool_call(self, openai, ddtrace_global_config, mock_llm
}
],
metadata={"temperature": 0},
token_metrics={"prompt_tokens": 157, "completion_tokens": 57, "total_tokens": 214},
token_metrics={"input_tokens": 157, "output_tokens": 57, "total_tokens": 214},
tags={"ml_app": "<ml-app-name>"},
)
)