
Commit 4b23389

Python: Emit token usage with streaming chat completion agent. (#12416)
### Motivation and Context

The chat completion agent was not emitting token usage during streaming invocation because we were only allowing through `response.items`. In the case of token usage, `response.items` is `[]` and the usage is contained in the message's `metadata` dict. This PR fixes that bug by allowing through `response.items or response.metadata.get("usage")`. Two new samples are added to the concepts/agents/chat_completion dir to show how one can track token usage for streaming and non-streaming agent invocation. Token usage handling is also added to the chat completion agent integration tests.

### Description

- Fixes a bug where we weren't emitting the streaming token usage for the chat completion agent. Also now includes the `prompt_tokens_details` and `completion_tokens_details` models that are returned but were not previously handled.
- Adds new samples.
- Updates integration tests to track token usage and make sure the counts are non-zero.
- Closes #12411

### Contribution Checklist

- [X] The code builds clean without any errors or warnings
- [X] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [X] All unit tests pass, and I have added new tests where possible
- [X] I didn't break anyone 😄
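The one-line gate change can be illustrated with a standalone sketch. The `Chunk` dataclass below is a hypothetical stand-in for Semantic Kernel's streaming message type, not the library's actual class; only the boolean condition mirrors the PR:

```python
from dataclasses import dataclass, field


@dataclass
class Chunk:
    # Hypothetical stand-in for a streaming chat message chunk:
    # content items plus a metadata dict that may carry "usage".
    items: list = field(default_factory=list)
    metadata: dict = field(default_factory=dict)


def should_emit_old(chunk: Chunk) -> bool:
    # Old gate: usage-only chunks have items == [], so they were dropped.
    return bool(chunk.items)


def should_emit_new(chunk: Chunk) -> bool:
    # Fixed gate: also let through chunks that carry only usage metadata.
    return bool(chunk.items or chunk.metadata.get("usage"))


usage_chunk = Chunk(items=[], metadata={"usage": {"prompt_tokens": 10, "completion_tokens": 5}})
print(should_emit_old(usage_chunk))  # False: the usage was silently lost
print(should_emit_new(usage_chunk))  # True: the usage now reaches the caller
```

A chunk with neither items nor usage is still filtered out under the new gate, so purely empty deltas are not emitted.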
1 parent c3c8dfa commit 4b23389

File tree

6 files changed: +268 additions, −5 deletions

python/samples/concepts/README.md

Lines changed: 3 additions & 1 deletion

```diff
@@ -44,11 +44,13 @@
 - [Chat Completion Agent as Kernel Function](./agents/chat_completion_agent/chat_completion_agent_as_kernel_function.py)
 - [Chat Completion Agent Function Termination](./agents/chat_completion_agent/chat_completion_agent_function_termination.py)
-- [Chat Completion Agent Templating](./agents/chat_completion_agent/chat_completion_agent_prompt_templating.py)
 - [Chat Completion Agent Message Callback Streaming](./agents/chat_completion_agent/chat_completion_agent_message_callback_streaming.py)
 - [Chat Completion Agent Message Callback](./agents/chat_completion_agent/chat_completion_agent_message_callback.py)
+- [Chat Completion Agent Templating](./agents/chat_completion_agent/chat_completion_agent_prompt_templating.py)
+- [Chat Completion Agent Streaming Token Usage](./agents/chat_completion_agent/chat_completion_agent_streaming_token_usage.py)
 - [Chat Completion Agent Summary History Reducer Agent Chat](./agents/chat_completion_agent/chat_completion_agent_summary_history_reducer_agent_chat.py)
 - [Chat Completion Agent Summary History Reducer Single Agent](./agents/chat_completion_agent/chat_completion_agent_summary_history_reducer_single_agent.py)
+- [Chat Completion Agent Token Usage](./agents/chat_completion_agent/chat_completion_agent_token_usage.py)
 - [Chat Completion Agent Truncate History Reducer Agent Chat](./agents/chat_completion_agent/chat_completion_agent_truncate_history_reducer_agent_chat.py)
 - [Chat Completion Agent Truncate History Reducer Single Agent](./agents/chat_completion_agent/chat_completion_agent_truncate_history_reducer_single_agent.py)
```

python/samples/concepts/agents/chat_completion_agent/chat_completion_agent_streaming_token_usage.py

Lines changed: 110 additions & 0 deletions (new file)

```python
# Copyright (c) Microsoft. All rights reserved.

import asyncio
from typing import Annotated

from semantic_kernel.agents import ChatCompletionAgent, ChatHistoryAgentThread
from semantic_kernel.connectors.ai.completion_usage import CompletionUsage
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.functions import kernel_function

"""
The following sample demonstrates how to create a chat completion agent
and use it with streaming responses. It also shows how to track token
usage during the streaming process.
"""


# Define a sample plugin for the sample
class MenuPlugin:
    """A sample Menu Plugin used for the concept sample."""

    @kernel_function(description="Provides a list of specials from the menu.")
    def get_specials(self) -> Annotated[str, "Returns the specials from the menu."]:
        return """
        Special Soup: Clam Chowder
        Special Salad: Cobb Salad
        Special Drink: Chai Tea
        """

    @kernel_function(description="Provides the price of the requested menu item.")
    def get_item_price(
        self, menu_item: Annotated[str, "The name of the menu item."]
    ) -> Annotated[str, "Returns the price of the menu item."]:
        return "$9.99"


async def main() -> None:
    agent = ChatCompletionAgent(
        service=AzureChatCompletion(),
        name="Assistant",
        instructions="Answer questions about the menu.",
        plugins=[MenuPlugin()],
    )

    # Create a thread for the agent
    # If no thread is provided, a new thread will be
    # created and returned with the initial response
    thread: ChatHistoryAgentThread = None

    user_inputs = [
        "Hello",
        "What is the special soup?",
        "How much does that cost?",
        "Thank you",
    ]

    completion_usage = CompletionUsage()

    for user_input in user_inputs:
        print(f"\n# User: '{user_input}'")
        async for response in agent.invoke_stream(
            messages=user_input,
            thread=thread,
        ):
            if response.content:
                print(response.content, end="", flush=True)
            if response.metadata.get("usage"):
                completion_usage += response.metadata["usage"]
                print(f"\nStreaming Usage: {response.metadata['usage']}")
            thread = response.thread
        print()

    # Print the completion usage
    print(f"\nStreaming Total Completion Usage: {completion_usage.model_dump_json(indent=4)}")


"""
Sample Output:

# User: 'Hello'
Hello! How can I help you with the menu today?

# User: 'What is the special soup?'
The special soup today is Clam Chowder. Would you like more details or are you interested in something else from
the menu?

# User: 'How much does that cost?'
The Clam Chowder special soup costs $9.99. Would you like to add it to your order or ask about something else?

# User: 'Thank you'
You're welcome! If you have any more questions or need help with the menu, just let me know. Enjoy your meal!

Streaming Total Completion Usage: {
    "prompt_tokens": 1150,
    "prompt_tokens_details": {
        "audio_tokens": 0,
        "cached_tokens": 0
    },
    "completion_tokens": 134,
    "completion_tokens_details": {
        "accepted_prediction_tokens": 0,
        "audio_tokens": 0,
        "reasoning_tokens": 0,
        "rejected_prediction_tokens": 0
    }
}
"""


if __name__ == "__main__":
    asyncio.run(main())
```
python/samples/concepts/agents/chat_completion_agent/chat_completion_agent_token_usage.py

Lines changed: 111 additions & 0 deletions (new file)

```python
# Copyright (c) Microsoft. All rights reserved.

import asyncio
from typing import Annotated

from semantic_kernel.agents import ChatCompletionAgent, ChatHistoryAgentThread
from semantic_kernel.connectors.ai.completion_usage import CompletionUsage
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.functions import kernel_function

"""
The following sample demonstrates how to create a chat completion agent
and use it with non-streaming responses. It also shows how to track token
usage during agent invoke.
"""


# Define a sample plugin for the sample
class MenuPlugin:
    """A sample Menu Plugin used for the concept sample."""

    @kernel_function(description="Provides a list of specials from the menu.")
    def get_specials(self) -> Annotated[str, "Returns the specials from the menu."]:
        return """
        Special Soup: Clam Chowder
        Special Salad: Cobb Salad
        Special Drink: Chai Tea
        """

    @kernel_function(description="Provides the price of the requested menu item.")
    def get_item_price(
        self, menu_item: Annotated[str, "The name of the menu item."]
    ) -> Annotated[str, "Returns the price of the menu item."]:
        return "$9.99"


async def main() -> None:
    agent = ChatCompletionAgent(
        service=AzureChatCompletion(),
        name="Assistant",
        instructions="Answer questions about the menu.",
        plugins=[MenuPlugin()],
    )

    # Create a thread for the agent
    # If no thread is provided, a new thread will be
    # created and returned with the initial response
    thread: ChatHistoryAgentThread = None

    user_inputs = [
        "Hello",
        "What is the special soup?",
        "How much does that cost?",
        "Thank you",
    ]

    completion_usage = CompletionUsage()

    for user_input in user_inputs:
        print(f"\n# User: '{user_input}'")
        async for response in agent.invoke(
            messages=user_input,
            thread=thread,
        ):
            if response.content:
                print(response.content)
            if response.metadata.get("usage"):
                completion_usage += response.metadata["usage"]
            thread = response.thread
        print()

    # Print the completion usage
    print(f"\nNon-Streaming Total Completion Usage: {completion_usage.model_dump_json(indent=4)}")


"""
Sample Output:

# User: 'Hello'
Hello! How can I help you with the menu today?

# User: 'What is the special soup?'
The special soup today is Clam Chowder. Would you like to know more about it or see the other specials?

# User: 'How much does that cost?'
The Clam Chowder special costs $9.99. Would you like to add that to your order or need more information?

# User: 'Thank you'
You're welcome! If you have any more questions or need help with the menu, just let me know. Enjoy your day!

Non-Streaming Total Completion Usage: {
    "prompt_tokens": 772,
    "prompt_tokens_details": {
        "audio_tokens": 0,
        "cached_tokens": 0
    },
    "completion_tokens": 92,
    "completion_tokens_details": {
        "accepted_prediction_tokens": 0,
        "audio_tokens": 0,
        "reasoning_tokens": 0,
        "rejected_prediction_tokens": 0
    }
}
"""


if __name__ == "__main__":
    asyncio.run(main())
```

python/semantic_kernel/agents/chat_completion/chat_completion_agent.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -452,7 +452,7 @@ async def invoke_stream(

                 if (
                     role == AuthorRole.ASSISTANT
-                    and response.items
+                    and (response.items or response.metadata.get("usage"))
                     and not any(
                         isinstance(item, (FunctionCallContent, FunctionResultContent)) for item in response.items
                     )
```
python/semantic_kernel/connectors/ai/completion_usage.py

Lines changed: 32 additions & 3 deletions

```diff
@@ -1,27 +1,56 @@
 # Copyright (c) Microsoft. All rights reserved.

+
 from openai.types import CompletionUsage as OpenAICompletionUsage
+from openai.types.completion_usage import CompletionTokensDetails, PromptTokensDetails

 from semantic_kernel.kernel_pydantic import KernelBaseModel


 class CompletionUsage(KernelBaseModel):
-    """Completion usage information."""
+    """A class representing the usage of tokens in a completion request."""

     prompt_tokens: int | None = None
+    prompt_tokens_details: PromptTokensDetails | None = None
     completion_tokens: int | None = None
+    completion_tokens_details: CompletionTokensDetails | None = None

     @classmethod
     def from_openai(cls, openai_completion_usage: OpenAICompletionUsage):
-        """Create a CompletionUsage object from an OpenAI response."""
+        """Create a CompletionUsage instance from an OpenAICompletionUsage instance."""
         return cls(
             prompt_tokens=openai_completion_usage.prompt_tokens,
+            prompt_tokens_details=openai_completion_usage.prompt_tokens_details
+            if openai_completion_usage.prompt_tokens_details
+            else None,
             completion_tokens=openai_completion_usage.completion_tokens,
+            completion_tokens_details=openai_completion_usage.completion_tokens_details
+            if openai_completion_usage.completion_tokens_details
+            else None,
         )

     def __add__(self, other: "CompletionUsage") -> "CompletionUsage":
-        """Add two CompletionUsage objects."""
+        """Combine two CompletionUsage instances by summing their token counts."""
+
+        def _merge_details(cls, a, b):
+            """Merge two details objects by summing their fields."""
+            if a is None and b is None:
+                return None
+            kwargs = {}
+            for field in cls.__annotations__:
+                x = getattr(a, field, None)
+                y = getattr(b, field, None)
+                value = None if x is None and y is None else (x or 0) + (y or 0)
+                kwargs[field] = value
+            return cls(**kwargs)
+
         return CompletionUsage(
             prompt_tokens=(self.prompt_tokens or 0) + (other.prompt_tokens or 0),
             completion_tokens=(self.completion_tokens or 0) + (other.completion_tokens or 0),
+            prompt_tokens_details=_merge_details(
+                PromptTokensDetails, self.prompt_tokens_details, other.prompt_tokens_details
+            ),
+            completion_tokens_details=_merge_details(
+                CompletionTokensDetails, self.completion_tokens_details, other.completion_tokens_details
+            ),
         )
```
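The field-by-field merge can be exercised standalone. The sketch below re-implements the same logic with a plain dataclass standing in for OpenAI's `PromptTokensDetails` pydantic model, so it runs without `openai` or `semantic_kernel` installed; the summing behavior mirrors the PR's `_merge_details`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PromptTokensDetails:
    # Minimal stand-in for OpenAI's prompt-token details model,
    # used only to illustrate the merge logic.
    audio_tokens: Optional[int] = None
    cached_tokens: Optional[int] = None


def merge_details(cls, a, b):
    # Sum each annotated field, treating a missing value as 0,
    # and keep None only when both sides are None.
    if a is None and b is None:
        return None
    kwargs = {}
    for field_name in cls.__annotations__:
        x = getattr(a, field_name, None)
        y = getattr(b, field_name, None)
        kwargs[field_name] = None if x is None and y is None else (x or 0) + (y or 0)
    return cls(**kwargs)


merged = merge_details(
    PromptTokensDetails,
    PromptTokensDetails(audio_tokens=1, cached_tokens=None),
    PromptTokensDetails(audio_tokens=2, cached_tokens=256),
)
print(merged)  # PromptTokensDetails(audio_tokens=3, cached_tokens=256)
```

Iterating `cls.__annotations__` keeps the helper generic across both details models, so new token-detail fields added upstream are summed without code changes here.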

python/tests/integration/agents/chat_completion_agent/test_chat_completion_agent_integration.py

Lines changed: 11 additions & 0 deletions

```diff
@@ -5,6 +5,7 @@
 import pytest

 from semantic_kernel.agents import ChatCompletionAgent
+from semantic_kernel.connectors.ai.completion_usage import CompletionUsage
 from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion, OpenAIChatCompletion
 from semantic_kernel.contents import AuthorRole, ChatMessageContent, StreamingChatMessageContent
 from semantic_kernel.contents.image_content import ImageContent
@@ -86,10 +87,15 @@ async def test_invoke(self, chat_completion_agent: ChatCompletionAgent, agent_te
         """Test invoke of the agent."""
         responses = await agent_test_base.get_invoke_with_retry(chat_completion_agent, messages="Hello")
         assert len(responses) > 0
+        usage: CompletionUsage = CompletionUsage()
         for response in responses:
             assert isinstance(response.message, ChatMessageContent)
             assert response.message.role == AuthorRole.ASSISTANT
             assert response.message.content is not None
+            if response.metadata.get("usage"):
+                usage += response.metadata["usage"]
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0

     @pytest.mark.parametrize("chat_completion_agent", ["azure", "openai"], indirect=True, ids=["azure", "openai"])
     async def test_invoke_with_thread(self, chat_completion_agent: ChatCompletionAgent, agent_test_base: AgentTestBase):
@@ -115,10 +121,15 @@ async def test_invoke_stream(self, chat_completion_agent: ChatCompletionAgent, a
         """Test invoke stream of the agent."""
         responses = await agent_test_base.get_invoke_stream_with_retry(chat_completion_agent, messages="Hello")
         assert len(responses) > 0
+        usage: CompletionUsage = CompletionUsage()
         for response in responses:
             assert isinstance(response.message, StreamingChatMessageContent)
             assert response.message.role == AuthorRole.ASSISTANT
             assert response.message.content is not None
+            if response.metadata.get("usage"):
+                usage += response.metadata["usage"]
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0

     @pytest.mark.parametrize("chat_completion_agent", ["azure", "openai"], indirect=True, ids=["azure", "openai"])
     async def test_invoke_stream_with_thread(
```
