In [1]:
from decimal import Decimal
import functools
import io
import itertools
import pprint
import re

import attrs
import IPython
import bs4
import pandas as pd
import requests

In [2]:
_RATE_PATTERN = re.compile(r'\A\s*\$([\d.]+)\s*/\s*1K\s+tokens\s*\Z')
"""Regex to parse rates. Currently all rates are given per 1K tokens."""

_RATE_PATTERN_DENOMINATOR = 1000
"""The denominator that _RATE_PATTERN currently works with."""

'The denominator that _RATE_PATTERN currently works with.'

In [3]:
class PriceRetrievalError(Exception):
    """Raised when model pricing data could not be obtained."""

In [4]:
@attrs.frozen
class Rate:
    """An immutable price, quoted in US dollars per block of tokens."""

    numerator: Decimal
    """Price in US dollars charged for ``denominator`` tokens."""

    denominator: int
    """How many tokens the ``numerator`` price covers."""

    def __str__(self):
        """Render this rate as human-readable text for user interfaces."""
        return '${} per {} tokens'.format(self.numerator, self.denominator)

In [5]:
def _parse_rate(text: str) -> Rate:
    """Parse pricing from informal text, returning a Rate.

    ``text`` is expected to look like ``"$0.0004 / 1K tokens"``; surrounding
    whitespace is tolerated.

    Raises:
        PriceRetrievalError: If ``text`` is not a string, does not match the
            expected rate format, or its amount is not a valid decimal number.
    """
    # Scraped table cells are not guaranteed to be strings (pandas represents
    # an empty cell as a float NaN, for example), and the regex would raise
    # TypeError on such values instead of the documented error.
    if not isinstance(text, str):
        raise PriceRetrievalError(f"can't parse rate: {text!r}")

    match = _RATE_PATTERN.fullmatch(text)
    if match is None:
        raise PriceRetrievalError(f"can't parse rate: {text.strip()}")

    try:
        numerator = Decimal(match[1])
    except ArithmeticError as error:
        # Decimal signals unparseable text with decimal.InvalidOperation,
        # which derives from ArithmeticError. Report it as a parse failure
        # rather than leaking an arithmetic exception to callers.
        raise PriceRetrievalError(
            f"can't parse rate: {text.strip()}"
        ) from error

    return Rate(numerator, _RATE_PATTERN_DENOMINATOR)

In [6]:
_PRICING_PAGE_TIMEOUT = 30
"""Seconds to wait for the pricing page server to connect or send data."""


@functools.cache
def get_pricing_page() -> str:
    """Retrieve the pricing page from the OpenAI website.

    The page text is cached after the first successful call, so later calls
    in the same session do not hit the network again. A call that raises is
    not cached and will be retried on the next call.

    Returns:
        The HTML text of the pricing page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an error status.
    """
    # Without a timeout, requests can wait forever on an unresponsive server.
    response = requests.get(
        'https://openai.com/api/pricing/',
        timeout=_PRICING_PAGE_TIMEOUT,
    )
    response.raise_for_status()
    return response.text

In [7]:
def _need(condition: bool) -> None:
    """Raise PriceRetrievalError if the condition is not satisfied."""
    if not condition:
        raise PriceRetrievalError("can't find embedding model prices on page")

In [8]:
def find_embedding_model_prices(
    *, displayed_only: bool = False
) -> dict[str, Rate]:
    """Retrieve the prices of embedding models.

    Locates the single "Embedding models" heading on the pricing page, reads
    the Model/Usage tables in the enclosing section, and parses every usage
    cell into a Rate.

    Args:
        displayed_only: Passed through to ``pandas.read_html``.

    Returns:
        A dict mapping each model name to its Rate.

    Raises:
        PriceRetrievalError: If the section or its tables can't be located,
            the tables have unexpected columns, or a rate can't be parsed.

    Errors raised by get_pricing_page() while fetching the page propagate
    unchanged.
    """
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed in the environment.
    doc = bs4.BeautifulSoup(get_pricing_page(), 'html.parser')

    # `string=` is the current spelling of the deprecated `text=` filter.
    headings = doc.find_all('h3', string='Embedding models')
    _need(len(headings) == 1)

    # The section container sits three levels above its heading. Check each
    # step so a shallower page layout is reported as a retrieval failure
    # instead of an AttributeError on None.
    doc_row = headings[0]
    for _ in range(3):
        doc_row = doc_row.parent
        _need(doc_row is not None)
    _need(len(doc_row.find_all('h3')) == 1)

    try:
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas versions.
        frames = pd.read_html(
            io.StringIO(str(doc_row)), displayed_only=displayed_only
        )
    except ValueError as error:
        # read_html raises ValueError when the markup contains no tables.
        raise PriceRetrievalError(
            "can't find embedding model prices on page"
        ) from error

    _need({tuple(df.columns.values) for df in frames} == {('Model', 'Usage')})
    data_rows = itertools.chain.from_iterable(df.to_numpy() for df in frames)
    return {name: _parse_rate(text) for name, text in data_rows}

In [10]:
# Fetch the current embedding-model rates and show them in insertion order.
prices = find_embedding_model_prices()
pprint.pprint(prices, sort_dicts=False)

{'Ada': Rate(numerator=Decimal('0.0004'), denominator=1000),
 'Ada v1': Rate(numerator=Decimal('0.0040'), denominator=1000),
 'Babbage v1': Rate(numerator=Decimal('0.0050'), denominator=1000),
 'Curie v1': Rate(numerator=Decimal('0.0200'), denominator=1000),
 'Davinci v1': Rate(numerator=Decimal('0.2000'), denominator=1000)}


In [12]:
# Show the user-facing text form of one rate (Rate.__str__), not its repr.
print(str(prices['Ada']))

$0.0004 per 1000 tokens
