# 비용 및 속도 평가
## LLM 비용 측정


In [9]:
import tiktoken
from openai import OpenAI

In [5]:
from dotenv import load_dotenv

load_dotenv()

True

In [6]:
def num_tokens_from_messages(messages, model="gpt-4o-mini"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message = 3
    tokens_per_name = 1

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>

    return num_tokens

In [12]:
def calculate_tokens(messages, model="gpt-3.5-turbo"):
    # Initialize the client
    client = OpenAI()
    
    # Make the API call
    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    
    # Get the output tokens
    output_tokens = response.usage.completion_tokens
    input_tokens = num_tokens_from_messages(messages, model=model)

    return input_tokens, output_tokens


In [13]:
# Example usage
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the weather like today?"},
]

input_tokens, output_tokens = calculate_tokens(messages)

print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {input_tokens + output_tokens}")

Input tokens: 24
Output tokens: 37
Total tokens: 61


[여기서](https://openai.com/api/pricing/) OpenAI API의 비용을 확인할 수 있습니다.

## LLM 속도 측정

In [14]:
import time
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI()

def measure_streaming_metrics(prompt):
    start_time = time.time()
    first_token_time = None
    last_token_time = None
    token_count = 0
    total_inter_token_time = 0

    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        stream=True
    )

    for chunk in stream:
        if token_count == 0:
            first_token_time = time.time()
        
        current_time = time.time()
        if token_count > 0:
            total_inter_token_time += current_time - last_token_time
        
        last_token_time = current_time
        token_count += len(chunk.choices[0].delta.content or "")

    time_to_first_token = first_token_time - start_time
    time_to_last_token = last_token_time - start_time
    avg_inter_token_latency = total_inter_token_time / (token_count - 1) if token_count > 1 else 0

    return {
        "time_to_first_token": time_to_first_token,
        "time_to_last_token": time_to_last_token,
        "avg_inter_token_latency": avg_inter_token_latency,
        "total_tokens": token_count
    }

In [15]:
# Example usage
prompt = "Tell me a short joke"
metrics = measure_streaming_metrics(prompt)

print(f"Time to first token: {metrics['time_to_first_token']:.4f} seconds")
print(f"Time to last token: {metrics['time_to_last_token']:.4f} seconds")
print(f"Average inter-token latency: {metrics['avg_inter_token_latency']:.4f} seconds")
print(f"Total tokens: {metrics['total_tokens']}")

Time to first token: 0.5867 seconds
Time to last token: 0.7636 seconds
Average inter-token latency: 0.0023 seconds
Total tokens: 78
