# APIM ❤️ AI Foundry

## Test your Azure AI Foundry models, enabled through Azure API Management!

Use this Jupyter notebook with Python code snippets to verify proper functionality of your Azure AI Foundry models when accessed through AI Gateway features in Azure API Management (APIM).

<a id='0'></a>
### ⚙️ Initialize client tool for your APIM service

👉 An existing Azure AI Foundry API is expected to already be configured in APIM.

In [None]:
import sys, json, requests
sys.path.insert(1, '../shared')  # add the shared directory to the Python path
import utils
from apimtools import APIMClientTool

# Deployment name and API version used to build the request URL and SDK clients.
model_name = "gpt-4.1-mini"
inference_api_version = "2025-03-01-preview"

try:
    # Specify the resource group name where the API Management resource is
    # located; optionally add another parameter with the apim_resource_name.
    apimClientTool = APIMClientTool("lab-...")
    apimClientTool.initialize()
    apimClientTool.discover_api('/openai')  # replace with /models for inference API

    azure_endpoint = str(apimClientTool.azure_endpoint)
    chat_completions_url = (
        f"{azure_endpoint}/openai/deployments/{model_name}"
        f"/chat/completions?api-version={inference_api_version}"
    )
    # Ensure that you have created a subscription in APIM.
    api_key = apimClientTool.apim_subscriptions[1].get("key")

    utils.print_ok("Testing tool initialized successfully!")
except Exception as e:
    # Any failure above is reported through the shared utils helper.
    utils.print_error(f"Error initializing APIM Client Tool: {e}")

<a id='sdk'></a>
### 🧪 Test the API using the Azure OpenAI Python SDK



In [None]:
import time
from openai import AzureOpenAI
    
# Create an SDK client from the endpoint, key and API version set up in the
# initialization cell, then send a single chat completion request.
client = AzureOpenAI(
    api_version=inference_api_version,
    api_key=api_key,
    azure_endpoint=azure_endpoint,
)
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "system", "content": "You are a sarcastic, unhelpful assistant."},
        {"role": "user", "content": "Can you tell me the time, please?"},
    ],
)
# Print the content of the first returned choice.
print("💬 ", response.choices[0].message.content)

<a id='requests'></a>
### 🧪 Test the API using a direct HTTP call


In [None]:
# Post one chat completion request straight to the chat completions URL,
# authenticating with the 'api-key' header, and print the outcome.
messages = {"messages": [
    {"role": "system", "content": "You are a sarcastic, unhelpful assistant."},
    {"role": "user", "content": "Can you tell me the time, please?"}
]}
response = requests.post(chat_completions_url, headers={'api-key': api_key}, json=messages)
utils.print_response_code(response)
utils.print_info(f"headers {response.headers}")
# This header is useful to determine the region of the backend that served the request.
# Single quotes are used inside the replacement field because reusing the
# enclosing double quotes is a SyntaxError on Python versions before 3.12.
utils.print_info(f"x-ms-region: {response.headers.get('x-ms-region')}")
if response.status_code == 200:
    data = json.loads(response.text)
    print("💬 ", data.get("choices")[0].get("message").get("content"))
else:
    # Non-200 responses: show the raw body returned by the gateway.
    utils.print_error(response.text)

<a id='ratelimit'></a>
### 🧪 Send multiple requests to surpass the established token rate limit


In [None]:
import requests

# The same request body is posted on every iteration.
messages = {
    "messages": [
        {"role": "system", "content": "You are a sarcastic, unhelpful assistant."},
        {"role": "user", "content": "Can you tell me the time, please?"},
    ]
}

# One (total_tokens, status_code) tuple is collected per request.
api_runs = []
for attempt in range(20):
    response = requests.post(
        chat_completions_url,
        headers={'api-key': api_key},
        json=messages,
    )
    utils.print_response_code(response)
    if response.status_code != 200:
        # Failed calls are recorded with zero tokens.
        print(response.text)
        total_tokens = 0
    else:
        data = json.loads(response.text)
        total_tokens = data.get("usage").get("total_tokens")
        print("💬 ", data.get("choices")[0].get("message").get("content"))
    api_runs.append((total_tokens, response.status_code))

<a id='plot'></a>
### 🔍 Analyze Token Rate limiting results


In [None]:
# plot the results
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
# Bar chart of tokens per run, coloured by HTTP status class, with the
# running token total drawn as a line on top.
mpl.rcParams['figure.figsize'] = [15, 7]
df = pd.DataFrame(api_runs, columns=['Tokens', 'Status Code'])
df['Run'] = range(1, len(df) + 1)
# 5xx -> red, 4xx -> yellow, everything else -> light blue.
colors = ['red' if str(code).startswith('5') else 'yellow' if str(code).startswith('4') else 'lightblue' for code in df['Status Code']]
ax = df.plot(kind='bar', x='Run', y='Tokens', color=colors, legend=False)
plt.title('Rate Limiting results')
plt.xlabel('Runs')
plt.ylabel('Tokens')
# Bars are drawn at positions 0..n-1 and already labelled with the run number;
# only the rotation is changed so tick labels stay aligned with their bars.
plt.xticks(rotation=0)
for i, code in enumerate(df['Status Code']):
    # Show the real status code for every failed run (not only 429).
    ax.text(i, 20, '' if int(code) == 200 else f'[{int(code)}]', ha='center', va='bottom')
for i, tokens in enumerate(df['Tokens']):
    ax.text(i, tokens + 5, '' if int(tokens) == 0 else str(tokens), ha='center', va='bottom')
accumulated_tokens = df['Tokens'].cumsum()
# Shift by one so the line points sit on the bar positions (0..n-1).
(accumulated_line,) = ax.plot(df['Run'] - 1, accumulated_tokens, color='green', label='Accumulated Tokens')
for i, total in enumerate(accumulated_tokens):
    ax.text(i, total + 6, str(int(total)), ha='center', va='bottom')
# Display the label attached to the accumulated-tokens line.
ax.legend(handles=[accumulated_line])
plt.show()

<a id='loadbalancer'></a>
### 🧪 Test the Load Balancer


In [None]:
import requests, time

# Send a series of small requests and record, for each response that carries
# an 'x-ms-region' header, the elapsed time and the reported region.
runs = 10
messages = {"messages": [
    {"role": "user", "content": "ping"}
]}
api_runs = []

# A session is used for connection pooling and default headers; the context
# manager closes it (releasing the connection) even if a request raises.
with requests.Session() as session:
    session.headers.update({'api-key': api_key})

    for i in range(runs):
        print(f"▶️ Run {i+1}/{runs}:")
        start_time = time.time()
        response = session.post(chat_completions_url, json=messages)
        response_time = time.time() - start_time
        utils.print_response_code(response)
        if "x-ms-region" in response.headers:
            # This header is useful to determine the region of the backend that served the request.
            region = response.headers.get("x-ms-region")
            print(f"x-ms-region: \x1b[1;32m{region}\x1b[0m")
            api_runs.append((response_time, region))

        if response.status_code == 200:
            data = json.loads(response.text)
            # Extracted to a local first: nesting double quotes inside a
            # double-quoted f-string is a SyntaxError before Python 3.12.
            content = data.get("choices")[0].get("message").get("content")
            print(f"💬 {content}\n")
        else:
            utils.print_error(f"{response.text}\n")

<a id='plotloadbalancer'></a>
### 🔍 Analyze Load Balancing results


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle as pltRectangle
import matplotlib as mpl

# Bar chart of response time per recorded run, coloured by the region that
# served it, with a dashed line marking the average response time.
mpl.rcParams['figure.figsize'] = [15, 7]
df = pd.DataFrame(api_runs, columns=['Response Time', 'Region'])
df['Run'] = range(1, len(df) + 1)

# Define a color map for each region
color_map = {'East US 2': 'lightgreen', 'Sweden Central': 'lightblue'}  # Add more regions and colors as needed

# Plot the dataframe with colored bars; regions missing from the map are gray
ax = df.plot(kind='bar', x='Run', y='Response Time', color=[color_map.get(region, 'gray') for region in df['Region']], legend=False)

plt.title('Load Balancing results')
plt.xlabel('Run #')
plt.ylabel('Response Time')
plt.xticks(rotation=0)

# Draw the average before building the legend so its label can be included.
average = df['Response Time'].mean()
average_line = ax.axhline(y=average, color='r', linestyle='--', label=f'Average: {average:.2f}')

# Add legend: one colored patch per distinct region, plus the average line
regions = list(df['Region'].unique())
legend_handles = [pltRectangle((0, 0), 1, 1, color=color_map.get(region, 'gray')) for region in regions]
ax.legend(legend_handles + [average_line], regions + [average_line.get_label()])

plt.show()

<a id='semanticcaching'></a>
### 🧪 Test the Semantic Caching

The code below contains a list of questions that will be randomly selected and sent as prompts to the OpenAI API

In [None]:
from openai import AzureOpenAI
import time, random

runs = 10
# Differently worded questions; one is picked at random for each run.
questions = [
    "How to Brew the Perfect Cup of Coffee?",
    "What are the steps to Craft the Ideal Espresso?",
    "Tell me how to create the best steaming Java?",
    "Explain how to make a caffeinated brewed beverage?",
]
api_runs = []  # Response Times for each run
client = AzureOpenAI(
    api_version=inference_api_version,
    api_key=api_key,
    azure_endpoint=azure_endpoint,  # The endpoint for the API with caching enabled
)

for run_number in range(1, runs + 1):
    print(f"▶️ Run {run_number}/{runs}:")
    prompt = random.choice(questions)
    print("💬 ", prompt)

    # Time only the chat completion call itself.
    started_at = time.time()
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a sarcastic, unhelpful assistant that provide short responses."},
            {"role": "user", "content": prompt},
        ],
    )
    elapsed = time.time() - started_at

    print(f"⌚ {elapsed:.2f} seconds")
    print(f"🗨️ {response.choices[0].message.content}\n")
    api_runs.append(elapsed)


<a id='plotsemanticcaching'></a>
### 🔍 Analyze Semantic Caching performance

The first request should take a longer time as it makes it all the way to the Azure OpenAI backend. The subsequent requests should be much quicker as they draw from the semantic cache. 

In [None]:
# plot the results
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Bar chart of the response time recorded for each run, with the mean marked
# by a dashed horizontal line.
mpl.rcParams['figure.figsize'] = [15, 5]
df = pd.DataFrame(api_runs, columns=['Response Time'])
df['Run'] = range(1, len(df) + 1)

ax = df.plot(kind='bar', x='Run', y='Response Time', legend=False)
ax.set_title('Semantic Caching Performance')
ax.set_xlabel('Runs')
ax.set_ylabel('Response Time (s)')
plt.xticks(rotation=0)  # Keep the run-number tick labels horizontal

average = df['Response Time'].mean()
ax.axhline(y=average, color='r', linestyle='--', label=f'Average: {average:.2f}')
ax.legend()

plt.show()