In [1]:
#————————————————————

# Name: Azure OpenAI Cost Estimation (V1)

# Purpose: 

# Verify token count and estimate cost.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 10.01.2024
# Last Updated: 10.01.2024
# Python Version: 3.10.4

# General Sources:
# https://sdk.vercel.ai/
# https://github.com/LazaUK/AOAI-Streaming-TokenUsage/tree/main

# Azure OpenAI Usage:
# https://community.openai.com/t/whats-the-gpt-4-turbo-encoding/505059
# https://demiliani.com/2023/12/19/monitoring-your-azure-openai-usage/
# https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models

# Additionals:

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

hello


In [67]:
# Import required libraries
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import tiktoken
import math


In [64]:
# Assistant prompt describing the xlsx data-cleansing workflow (plain string form).
# NOTE(review): the following cell rebinds `instructions` to a chat-message list,
# so on a top-to-bottom run this string is overwritten before it is token-counted.
instructions = '''
You are a senior data analyst who will work with data in an xlsx file.
You have access to a sandboxed environment for writing python code.
When the user asks you to perform your actions, you will use the provided xlsx file.
You will perform data cleansing and transformation steps.
Execute each of the steps listed below in your ACTIONS section.

ACTIONS:

1. Read the xlsx file into a pandas DataFrame.
2. Keep only the columns "product_name", "level_1", "level_2", "level_3".
3. Trim and lowercase the values of columns "product_name", "level_1", "level_2", "level_3".
4. Remove non-alphanumeric characters from column "product_name".
5. Remove empty or NA rows from columns "product_name", "level_1", "level_2", "level_3".
6. Remove duplicate values from column "product_name" and prepare the results as Table_1.
7. Prepare Table_1 as an xlsx file for download by the user. 
8. Provide a summary paragraph explaining the preparation of the data set.

DO NOT:
1. Do not return any images. 
2. Do not return any other file types.
'''

In [83]:
# Single-turn chat payload (list of role/content dicts) used as the token-counting input.
# This rebinds `instructions`, replacing the plain-string prompt assigned in the cell above.
# NOTE(review): "tolenization" / "OpenAi" look like typos for "tokenization" / "OpenAI";
# correcting them will change the token count reported by the cells below.
instructions = [{
   "role": "user",
   "content": "Explain to me how tolenization is working in OpenAi models?",
   }]

In [81]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """
    Return the number of tokens used by a list of messages.

    Args:
        messages: Either a list/tuple of chat-message dicts (for example
            ``{"role": "user", "content": "..."}``) or a single plain string.
        model: Model name. It selects the tiktoken encoding and the fixed
            per-message / per-name token overhead. Generic "gpt-3.5-turbo*" and
            "gpt-4*" names are counted with the 0613 snapshot rules.

    Returns:
        int: For a message sequence, the encoded length of every value plus the
        per-message overhead and 3 reply-priming tokens. For a plain string,
        only the encoded length of that string.

    Raises:
        NotImplementedError: If ``model`` does not belong to a supported family.
        TypeError: If ``messages`` is neither a string nor a list/tuple
            (previously such input silently produced a count of 0).
    """

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown to tiktoken: fall back to the cl100k_base encoding.
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    # A bare string carries no chat framing: count its encoded length only.
    if isinstance(messages, str):
        return len(encoding.encode(messages))

    # Fail loudly instead of returning 0 for input we do not know how to count.
    if not isinstance(messages, (list, tuple)):
        raise TypeError(
            f"messages must be a str or a list/tuple of message dicts, got {type(messages).__name__}"
        )

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            # A None value (e.g. "content": None) has no text to encode; skip it
            # rather than letting encode() fail on a non-string.
            if value is None:
                continue
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [85]:
# Token count for the current `instructions` value; the generic "gpt-4" name is
# counted with the gpt-4-0613 rules (the function prints a warning when it does so).
num_tokens_from_messages(instructions, model="gpt-4")



41

In [87]:
# Function to calculate request cost (model example: gpt-4-0613)
def cost(
    prompt_tokens,
    *,
    input_price_per_1000=0.03,
    output_price_per_1000=0.06,
    code_interpreter=0.03,
    completion_ratio=0.2,
):
    """
    Estimate the cost of one request from its prompt token count.

    The completion size is not measured; it is approximated as
    ``ceil(prompt_tokens * completion_ratio)``.

    Args:
        prompt_tokens: Number of input (prompt) tokens. Must be >= 0.
        input_price_per_1000: Price per 1000 input tokens.
        output_price_per_1000: Price per 1000 output tokens.
        code_interpreter: Flat fee added once per request.
        completion_ratio: Assumed output size as a fraction of the prompt size.

    Returns:
        float: input cost + estimated output cost + flat code-interpreter fee,
        in the same currency unit as the supplied prices.

    Raises:
        ValueError: If ``prompt_tokens`` is negative.
    """
    if prompt_tokens < 0:
        raise ValueError(f"prompt_tokens must be non-negative, got {prompt_tokens}")

    # Estimated output size, rounded up to whole tokens.
    completion_tokens = math.ceil(prompt_tokens * completion_ratio)

    # Calculate the cost for input and output tokens separately
    input_cost = (prompt_tokens / 1000) * input_price_per_1000
    output_cost = (completion_tokens / 1000) * output_price_per_1000

    # The total cost is the sum of input cost, output cost and the flat fee
    total_cost = input_cost + output_cost + code_interpreter
    return total_cost

In [88]:
# Per-request estimate for the current `instructions`, scaled by 50.
# NOTE(review): presumably 50 is the expected number of requests — confirm, and
# consider naming it as a constant.
cost(num_tokens_from_messages(instructions, model="gpt-4")) * 50



1.5885