# Simple Endpoint Benchmarking

### Introduction

- you can check more details on [Artificial Analysis](https://artificialanalysis.ai/methodology) website.
- Also check the benchmarking provided by [Databricks](https://docs.databricks.com/en/_extras/notebooks/source/machine-learning/large-language-models/llm-benchmarking.html)

### OpenAI API

In [9]:
import requests

# Configuration
API_KEY = "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"  # Replace with your OpenAI API key
endpoint_url = "https://api.openai.com/v1/chat/completions"
input_tokens = 2048
output_tokens = 256
num_queries_per_thread = 20

# Setup the headers
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json'
}

# Setup the data
data = {
    "model": "gpt-4o-mini",
    "messages": [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        }
    ],
    "max_tokens": 100,
    "temperature": 0.0,
    "stop": ["\n", "System:"]
}

# Make the requests
response = requests.post(endpoint_url, headers=headers, json=data)
response.raise_for_status()
response_json = response.json()
print(response_json)


{'id': 'chatcmpl-9uc9eSHlb9BQo6kDD7qQHJ1BhW4Rq', 'object': 'chat.completion', 'created': 1723280062, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'How can I assist you today?', 'refusal': None}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 13, 'completion_tokens': 7, 'total_tokens': 20}, 'system_fingerprint': 'fp_48196bc67a'}


In [8]:
import asyncio
import time
import aiohttp
import requests
import json
import statistics
import matplotlib.pyplot as plt
import math

# Configuration
API_KEY = "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"  # Replace with your OpenAI API key
endpoint_url = "https://api.openai.com/v1/chat/completions"
input_tokens = 2048
output_tokens = 256
num_queries_per_thread = 20

# Setup the headers
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json'
}

def get_request(in_tokens, out_tokens):
    # Adjust this function to generate a prompt with the specified number of input tokens.
    prompt = " ".join(["word"] * in_tokens)
    return {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": out_tokens,
        "temperature": 0.2,
        "stop": ["\n", "System:"]
    }

# Sends an initial set of warm-up requests and validates the response
def warm_up_and_validate(in_tokens=2048, out_tokens=256, warm_up_requests=10):
    input_data = get_request(in_tokens, out_tokens)
    session = requests.Session()

    for _ in range(warm_up_requests):
        resp = session.post(endpoint_url, headers=headers, json=input_data)
        result = resp.json()
        print(result)
        assert result['usage']['completion_tokens'] == out_tokens, f"Model received {result['usage']['completion_tokens']} output tokens, expected {out_tokens}."
        assert result['usage']['prompt_tokens'] == in_tokens, f"Model received {result['usage']['prompt_tokens']} input tokens, expected {in_tokens}. Please adjust the input prompt."

warm_up_and_validate(input_tokens, output_tokens)


{'error': {'message': "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.", 'type': 'invalid_request_error', 'param': 'prompt', 'code': 'invalid_prompt'}}


KeyError: 'usage'

In [3]:

latencies = []

# This is a single worker, which processes the given number of requests, one after the other.
async def worker(index, num_requests, in_tokens=2048, out_tokens=256):
    input_data = get_request(in_tokens, out_tokens)
    await asyncio.sleep(0.1 * index)

    for i in range(num_requests):
        request_start_time = time.time()

        success = False
        while not success:
            timeout = aiohttp.ClientTimeout(total=3 * 3600)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(endpoint_url, headers=headers, json=input_data) as response:
                    success = response.ok
                    chunks = []
                    async for chunk, _ in response.content.iter_chunks():
                        chunks.append(chunk)
        latency = time.time() - request_start_time
        result = json.loads(b''.join(chunks))
        latencies.append((result['usage']['prompt_tokens'],
                          result['usage']['completion_tokens'], latency))

# This code runs parallel sets of queries with num_requests_per_worker queries per worker.
async def single_benchmark(num_requests_per_worker, num_workers, in_tokens=2048, out_tokens=256):
    tasks = []
    for i in range(num_workers):
        task = asyncio.create_task(worker(i, num_requests_per_worker, in_tokens, out_tokens))
        tasks.append(task)
    await asyncio.gather(*tasks)

# This runs the benchmark with 1, n/2 and n output tokens to derive time to first token and time per token.
async def benchmark(parallel_queries=1, in_tokens=2048, out_tokens=256, num_tries=5):
    avg_num_input_tokens = [0, 0, 0]
    avg_num_output_tokens = [0, 0, 0]
    median_latency = [0, 0, 0]
    print(f"Parallel queries {parallel_queries}")

    for i, out_tokens in enumerate([1, out_tokens // 2, out_tokens]):
        latencies.clear()
        await single_benchmark(num_tries, parallel_queries, in_tokens, out_tokens)
        avg_num_input_tokens[i] = statistics.mean([inp for inp, _, _ in latencies])
        avg_num_output_tokens[i] = statistics.mean([outp for _, outp, _ in latencies])
        median_latency[i] = statistics.median([latency for _, _, latency in latencies])
        tokens_per_sec = (avg_num_input_tokens[i] + avg_num_output_tokens[i]) * parallel_queries / median_latency[i]
        print(f'Output tokens {avg_num_output_tokens[i]}, median latency (s): {round(median_latency[i], 2)}, tokens per second {round(tokens_per_sec, 1)}')

    output_token_time = (median_latency[2] - median_latency[1]) * 1000 / (avg_num_output_tokens[2] - avg_num_output_tokens[1])
    print(f'Time to first token (s): {round(median_latency[0], 2)}, Time per output token (ms): {round(output_token_time, 2)}')
    data.append([median_latency[2],
                 (avg_num_input_tokens[2] + avg_num_output_tokens[2]) * parallel_queries / median_latency[2]])

# This will run until the throughput of the model is no longer increasing by 10%.
data = []
for parallel_queries in [1, 2, 4, 8]:
    print(f"Input tokens {input_tokens}")
    await benchmark(parallel_queries, input_tokens, output_tokens, num_queries_per_thread)
    if len(data) > 1 and (data[-1][1] - data[-2][1]) / data[-2][1] < 0.1:
        break

# Plot the latency vs throughput curve
plt.xlabel("Latency (s)")
plt.ylabel("Throughput (tok/s)")
plt.plot([x[0] for x in data], [x[1] for x in data], marker='o')
plt.show()


{'error': {'message': "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", 'type': 'invalid_request_error', 'param': None, 'code': None}}


KeyError: 'usage'