# APIM ❤️ AI Foundry

## Test your Azure AI Foundry models, enabled through Azure API Management!

Use this Jupyter notebook with Python code snippets to verify proper functionality of your Azure AI Foundry models when accessed through AI Gateway features in Azure API Management (APIM).

<a id='0'></a>
### ⚙️ Initialize client tool for your APIM service

👉 An existing Azure AI Foundry API is expected to be already configured on APIM.

In [None]:
import sys, json, requests
sys.path.insert(1, '../shared')  # add the shared directory to the Python path
import utils
from apimtools import APIMClientTool

# Model deployment name and inference API version used to build the request URLs below.
model_name = "gpt-4.1"
inference_api_version = "2025-03-01-preview"

try:
    # Specify the resource group name where the API Management resource is located,
    # or optionally add another parameter with the apim_resource_name.
    apimClientTool = APIMClientTool("lab-ai-gateway")
    apimClientTool.initialize()
    apimClientTool.discover_api('/openai')  # replace with /models for inference API

    apim_resource_gateway_url = str(apimClientTool.apim_resource_gateway_url)

    # The Foundry project endpoint is derived from the gateway URL by swapping
    # the 'apim-' prefix and the APIM DNS suffix for their Foundry counterparts.
    foundry_base_url = apim_resource_gateway_url.replace('apim-', 'foundry-')
    foundry_base_url = foundry_base_url.replace('.azure-api.net', '.services.ai.azure.com')
    foundry_project_endpoint = f"{foundry_base_url}/api/projects/default"

    azure_endpoint = str(apimClientTool.azure_endpoint)
    chat_completions_url = f"{azure_endpoint}/openai/deployments/{model_name}/chat/completions?api-version={inference_api_version}"

    # Ensure that you have created a subscription in APIM; the entry at index 1 is the one used.
    selected_subscription = apimClientTool.apim_subscriptions[1]
    api_key = selected_subscription.get("key")

    utils.print_ok("Testing tool initialized successfully!")
except Exception as e:
    # Any failure is reported here; names assigned after the failing line stay undefined.
    utils.print_error(f"Error initializing APIM Client Tool: {e}")



<a id='sdk'></a>
### 🧪 Test the API using the Azure OpenAI Python SDK



In [None]:
from openai import AzureOpenAI

# Create an Azure OpenAI SDK client from the endpoint, key and API version
# prepared by the initialization cell.
client = AzureOpenAI(azure_endpoint=azure_endpoint, api_key=api_key, api_version=inference_api_version)

# Send a single chat completion request: one system prompt plus one user question.
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "system", "content": "You are a sarcastic, unhelpful assistant."},
        {"role": "user", "content": "Can you tell me the time, please?"},
    ],
)

# Print the text of the first returned choice.
first_choice = response.choices[0]
print("üí¨ ", first_choice.message.content)

<a id='requests'></a>
### 🧪 Test the API using a direct HTTP call


In [None]:
# Call the chat completions endpoint with a plain HTTP POST (no SDK) so the raw
# status code and response headers can be inspected.
messages={"messages":[
    {"role": "system", "content": "You are a sarcastic, unhelpful assistant."},
    {"role": "user", "content": "Can you tell me the time, please?"}
]}
chat_completions_url = f"{azure_endpoint}/openai/deployments/{model_name}/chat/completions?api-version={inference_api_version}"
response = requests.post(chat_completions_url, headers = {'api-key':api_key}, json = messages)
utils.print_response_code(response)
utils.print_info(f"headers {response.headers}")
# Single quotes are used inside the replacement field: reusing the enclosing
# double quote within an f-string is a SyntaxError on Python versions before 3.12.
utils.print_info(f"x-ms-region: {response.headers.get('x-ms-region')}") # this header is useful to determine the region of the backend that served the request
if response.status_code == 200:
    # Success: print the content of the first choice's message.
    data = json.loads(response.text)
    print("üí¨ ", data.get("choices")[0].get("message").get("content"))
else:
    # Any other status: show the raw response body as the error.
    utils.print_error(response.text)

<a id='ratelimit'></a>
### 🧪 Send multiple requests within one minute to surpass the established token rate limit


In [None]:
import requests, json, time

# Repeatedly call the chat completions endpoint for a fixed time window and
# record one (call start timestamp, total tokens, HTTP status) tuple per call in api_runs.
TEST_DURATION_SECONDS = 60  # Run for 1 minute

api_runs = []
start_time = time.time()
run_count = 0
request_headers = {'api-key': api_key}
messages = {
    "messages": [
        {"role": "system", "content": "You are a sarcastic, unhelpful assistant."},
        {"role": "user", "content": "Can you tell me the time, please?"},
    ]
}

print("üïê Starting API calls for 1 minute...")
print(f"Start time: {time.strftime('%H:%M:%S', time.localtime(start_time))}")

while time.time() - start_time < TEST_DURATION_SECONDS:
    run_count += 1
    call_start_time = time.time()
    response = requests.post(chat_completions_url, headers=request_headers, json=messages)
    elapsed_time = time.time() - start_time

    if response.status_code != 200:
        # Failed call (for example throttled): show the error body and count zero tokens.
        print(f"‚ñ∂Ô∏è Run: {run_count} | {elapsed_time:.1f}s | status: {response.status_code} ‚õî")
        print(f"    error: {response.text}")
        total_tokens = 0
    else:
        # Successful call: read the token usage from the body and the
        # consumed/remaining counters from the response headers.
        print(f"‚ñ∂Ô∏è Run: {run_count} | {elapsed_time:.1f}s | status: {response.status_code} ‚úÖ")
        data = json.loads(response.text)
        total_tokens = data.get("usage").get("total_tokens")
        print(f"    consumed tokens: {response.headers.get('consumed-tokens')}, remaining tokens: {response.headers.get('remaining-tokens')}")

    api_runs.append((call_start_time, total_tokens, response.status_code))
    time.sleep(0.1)  # Small delay to prevent overwhelming the API

# Summarize how many calls were made and at what average rate.
end_time = time.time()
total_duration = end_time - start_time
print(f"\nüèÅ Completed {run_count} API calls in {total_duration:.1f} seconds")
print(f"Average rate: {run_count / total_duration:.2f} calls/second")


<a id='plot'></a>
### 🔍 Analyze Token Rate limiting results


In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Plot the tokens used per call together with a locally simulated token bucket.
# The bucket level is computed here from the recorded calls; it is not read
# back from the gateway.
if 'api_runs' not in locals() or not api_runs:
    print('Run the 60-second API test first to capture api_runs data.')
else:
    # Re-base timestamps on the first call and treat a missing token count as 0.
    first_call_time = api_runs[0][0]
    calls = [(t - first_call_time, tokens or 0, status) for t, tokens, status in api_runs]

    times = [call[0] for call in calls]
    usage = [call[1] for call in calls]
    status_codes = [call[2] for call in calls]

    # Simulated bucket parameters.
    # NOTE(review): 1000 presumably mirrors the gateway's tokens-per-minute limit - confirm.
    capacity = 1000
    refill = capacity / 60  # tokens added back per second
    bucket = capacity
    last_time = 0.0
    levels = []

    for call_time, tokens in zip(times, usage):
        # Refill for the time elapsed since the previous call (capped at capacity),
        # record the level seen by this call, then drain what the call consumed.
        bucket = min(capacity, bucket + (call_time - last_time) * refill)
        levels.append(bucket)
        bucket = max(0, bucket - tokens)
        last_time = call_time

    # Green for success, red for throttling, orange for any other status code.
    status_palette = {200: 'tab:green', 429: 'tab:red'}
    colors = [status_palette.get(code, 'tab:orange') for code in status_codes]

    fig, ax1 = plt.subplots(figsize=(14, 6))
    ax2 = ax1.twinx()

    ax1.bar(times, usage, color=colors, width=0.35, alpha=0.7)
    ax2.plot(times, levels, color='purple', linewidth=2)
    ax2.axhline(capacity, color='purple', linestyle='--', alpha=0.6)

    # Throttled calls get an explicit marker placed slightly above their bar height.
    throttled_points = [(t, u) for t, u, code in zip(times, usage, status_codes) if code == 429]
    throttled_times = [point[0] for point in throttled_points]
    throttled_usage = [point[1] for point in throttled_points]
    if throttled_times:
        max_usage = max(usage) if usage else 0
        marker_offset = max_usage * 0.01
        throttled_marker_heights = [u + marker_offset for u in throttled_usage]
        ax1.scatter(throttled_times, throttled_marker_heights, marker='o', s=20, color='darkred', edgecolors='white', linewidth=0.4, zorder=6)

    ax1.set_xlabel('Seconds')
    ax1.set_ylabel('Tokens per call')
    ax2.set_ylabel('Tokens in bucket')
    ax1.set_title('Token bucket behaviour over 60 seconds')

    legend_items = [
        Patch(facecolor='tab:green', alpha=0.7, label='Success (200)'),
        Line2D([0], [0], color='purple', linewidth=2, label='Bucket level'),
        Line2D([0], [0], color='purple', linestyle='--', label='Capacity'),
        Line2D(
            [0], [0], marker='o', color='darkred', markersize=8, linestyle='None',
            markerfacecolor='darkred', markeredgecolor='white', label='Throttled (429)',
        ),
    ]
    ax1.legend(handles=legend_items, loc='upper right', bbox_to_anchor=(0.98, 0.85), framealpha=0.9)

    # Text summary of the outcome counts.
    success = status_codes.count(200)
    throttled = status_codes.count(429)
    print(f"Calls: {len(status_codes)} | Success: {success} | 429s: {throttled}")

<a id='Azure AI Agents'></a>
### 🧪 Execute an [Azure AI Foundry Agent using MCP Tools](https://learn.microsoft.com/en-us/azure/ai-foundry/agents/how-to/tools/model-context-protocol)


In [None]:
from azure.ai.agents.models import ListSortOrder, MessageTextContent, McpTool, RequiredMcpToolCall, SubmitToolApprovalAction, ToolApproval
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
import time

# Create an agent with an MCP tool attached, send it one prompt on a new thread,
# poll the run until it leaves the active states, then print the run steps and
# the messages in the thread.

# Host name of the MCP server. It is empty here and must be filled in before
# running; left empty, the server URL built below becomes "https:///sse".
weather_mcp_endpoint = ""

# Project client authenticated with DefaultAzureCredential against the Foundry project endpoint.
project_client = AIProjectClient(endpoint=foundry_project_endpoint,
            credential=DefaultAzureCredential())
agents_client = project_client.agents

# MCP tool definition
mcp_tool = McpTool(
    server_label="weather",
    server_url=f"https://{weather_mcp_endpoint}/sse",
)

prompt = "What's the weather in San Francisco, Seattle and Lisbon?"

# Agent creation
agent = agents_client.create_agent(
    model=model_name,
    name="agent-mcp",
    instructions="You are a weather agent.",
    tools=mcp_tool.definitions
)

print(f"üéâ Created agent, agent ID: {agent.id}")
print(f"‚ú® MCP Server: {mcp_tool.server_label} at {mcp_tool.server_url}")

# Thread creation
thread = agents_client.threads.create()
print(f"üßµ Created thread, thread ID: {thread.id}")

# Message creation
message = agents_client.messages.create(
    thread_id=thread.id,
    role="user",
    content=prompt,
)
print(f"üí¨ Created message, message ID: {message.id}")

# Set before the run is created, because the run below is given mcp_tool.resources.
mcp_tool.set_approval_mode("never")          # Disable human approval

# Run
run = agents_client.runs.create(thread_id=thread.id, agent_id=agent.id, tool_resources=mcp_tool.resources)
# Poll every 2 seconds while the run is in one of the listed active states.
# NOTE(review): "requires_action" is polled but never answered in this cell; if a
# run ever stays in that state this loop presumably would not terminate - confirm.
while run.status in ["queued", "in_progress", "requires_action"]:
    time.sleep(2)
    run = agents_client.runs.get(thread_id=thread.id, run_id=run.id)
    print(f"‚è≥ Run status: {run.status}")
if run.status == "failed":
    print(f"‚ùå Run error: {run.last_error}")

# Get Run steps
run_steps = agents_client.run_steps.list(thread_id=thread.id, run_id=run.id)
print()

for step in run_steps:
    print(f"üîÑ Run step: {step.id}, status: {step.status}, type: {step.type}")
    if step.type == "tool_calls":
        print(f"üõ†Ô∏è Tool call details:")
        # `json` is not imported in this cell; it relies on the import made in the initialization cell.
        for tool_call in step.step_details.tool_calls: ## type: ignore
            print(json.dumps(tool_call.as_dict(), indent=5))

# Get the messages in the thread
print("\nüìú Messages in the thread:")
# Note: this rebinds `messages`, which earlier cells use for the request payload dict.
messages = agents_client.messages.list(thread_id=thread.id, order=ListSortOrder.ASCENDING)

for item in messages:
    # Only the last content part of each message is inspected, and only text parts are printed.
    last_message_content = item.content[-1]
    if isinstance(last_message_content, MessageTextContent):
        print(f"üó®Ô∏è {item.role}: {last_message_content.text.value}")

# Clean up resources
# agents_client.delete_agent(agent.id) # Retain the agent to monitor its execution in AI Foundry.