# Base64-encoded embedding from `text-embedding-ada-002`

SPDX-License-Identifier: 0BSD

This examines a second-generation Ada embedding in detail.

In [1]:
import collections
import base64
import json
import re
from typing import Iterable

from cheap_repr import cheap_repr
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
import requests
import tabulate

import keys
import parsing

In [2]:
# Display NumPy floating-point values with a precision of 10 digits.
np.set_printoptions(precision=10)

In [3]:
# Fetch the key once. Keep it in `api_key` for requests built by hand, and
# also give it to the openai library.
api_key = keys.get_api_key()
openai.api_key = api_key

## 1. OpenAI Python library

### 1A. `openai.embeddings_utils.get_embedding`

This is a higher-level interface to `openai.Embedding.create` (below).

In [4]:
# Request the embedding through the convenience wrapper, then show an
# abbreviated repr of the resulting list of coordinates.
lib_utils_embedding = get_embedding(
    engine='text-embedding-ada-002',
    text='The food was delicious and the waiter...',
)
cheap_repr(lib_utils_embedding)

'[0.002235695719718933, -0.009273056872189045, 0.0158150065690279, ..., -0.015357705764472485, -0.019397201016545296, -0.002861309563741088]'

Let's save those results, for comparison to Java:

In [5]:
# Serialize the embedding as indented JSON and write it out as UTF-8 text.
with open('python-embedding.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(lib_utils_embedding, indent=4))

### 1B. `openai.Embedding.create`

In [6]:
# Same model and input text as the get_embedding call, but through the
# lower-level library entry point, which returns the whole response object.
embedding_request = {
    'model': 'text-embedding-ada-002',
    'input': 'The food was delicious and the waiter...',
}
lib_response = openai.Embedding.create(**embedding_request)
cheap_repr(lib_response)

"{'object': 'list', 'data': [{'object': 'embedding', 'index': 0, 'embedding': [...]}], 'model': 'text-embedding-ada-002-v2', 'usage': {'prompt_tokens': 8, 'total_tokens': 8}}"

In [7]:
# Only one input was sent, so the embedding of interest is the first entry
# of the response's `data` list.
first_result = lib_response.data[0]
lib_embedding = first_result.embedding
cheap_repr(lib_embedding)

'[0.002235695719718933, -0.009273056872189045, 0.0158150065690279, ..., -0.015357705764472485, -0.019397201016545296, -0.002861309563741088]'

Except on text where `get_embedding` does the extra work of converting newlines
to spaces, or **when nondeterminism in the model gives different results for
equivalent queries**, the results with `openai.embeddings_utils.get_embedding`
and `openai.Embedding.create` will agree:

In [8]:
# Exact equality check (no tolerance) between the two library results.
lib_utils_embedding == lib_embedding

True

## 2. POST requests to the OpenAI API endpoint

### 2A. No `encoding_format` (normal way)

In [9]:
# Call the embeddings endpoint directly, without `encoding_format`, so the
# embedding comes back as a JSON array of decimal numbers.
default_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
    },
    # requests does not time out by default. Bound the wait so that a stalled
    # connection raises an exception instead of hanging the kernel.
    timeout=60,
)
default_response.raise_for_status()  # Stop here on any 4xx/5xx status.
default_encoded = default_response.json()['data'][0]['embedding']
# Convert the parsed numbers to float32 for comparison with the other
# embeddings in this notebook.
default_embedding = np.array(default_encoded, dtype=np.float32)
default_embedding

array([ 0.0022356957, -0.009273057 ,  0.015815007 , ..., -0.015357706 ,
       -0.019397201 , -0.0028613096], dtype=float32)

#### Do the representations always have enough digits for float32?

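Since we are working with floating point, when we count digits to determine
precision, we start with the leftmost *nonzero* digit. I will call such digits
*mantissa digits* (they are more commonly called *significant digits*). A small
minority of float32 values require 9 base-10 mantissa digits to be represented
unambiguously, that is, so that converting the decimal text back to float32
recovers exactly the same value.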

Since decimal representations from the API endpoint omit *trailing* digits if
not required for round-tripping, we should expect most values to show fewer
than 9 mantissa digits. A glance at the output shows most of the numbers have
8, as expected. But it is not clear, at a glance, that 9 digits are ever shown,
since that is rarely needed. This shows that they are:

In [10]:
# Work from the raw response text rather than the parsed JSON, so digits are
# counted as the API wrote them.
# NOTE(review): `parsing` is a project-local module not shown here. Judging by
# how the results are used later in this notebook, the first call presumably
# yields the coordinate literals as strings, and the second presumably groups
# them into a mapping keyed by mantissa digit count. Confirm against the
# parsing module.
coordinate_strings = parsing.extract_coordinate_strings(default_response.text)
mantissa_length_groups = parsing.group_by_mantissa_length(coordinate_strings)

In [11]:
# One (digit count, number of coordinates) row per group, in ascending order
# of digit count.
mantissa_length_table = [
    (digits, len(mantissa_length_groups[digits]))
    for digits in sorted(mantissa_length_groups)
]
# Render as an HTML table so the notebook displays it richly.
tabulate.tabulate(mantissa_length_table, headers=('digits', 'count'), tablefmt='html')

digits,count
5,5
6,40
7,444
8,1021
9,26


A small fraction of the coordinates require 9 digits:

In [12]:
# Share of all coordinates whose decimal text uses 9 mantissa digits,
# printed as a percentage with two decimal places.
count_needing_9 = len(mantissa_length_groups[9])
ratio_that_need_9 = count_needing_9 / len(coordinate_strings)
print(format(ratio_that_need_9, '.2%'))

1.69%


Those coordinates are:

In [13]:
# Show the coordinate strings that were grouped under 9 mantissa digits.
mantissa_length_groups[9]

['-0.0124868695',
 '0.0141255325',
 '0.0147352675',
 '-0.0109879365',
 '0.0144558055',
 '-0.0130966045',
 '-0.0128171425',
 '0.0111276675',
 '-0.0148749985',
 '-0.0145955365',
 '-0.0110578025',
 '0.0119469995',
 '0.0144558055',
 '0.0124170035',
 '-0.0114071295',
 '0.0139858015',
 '-0.0152052725',
 '-0.0110578025',
 '-0.0132363355',
 '-0.0150655415',
 '0.0103782015',
 '-0.0100987395',
 '-0.0144558055',
 '-0.0137063395',
 '-0.0139858015',
 '0.0145955365']

### 2B. Passing `base64` as `encoding_format`

In [14]:
# Same request as the default-format call, except that `encoding_format` asks
# the endpoint to return the embedding as a base64 string.
base64_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
        'encoding_format': 'base64',
    },
    # requests does not time out by default. Bound the wait so that a stalled
    # connection raises an exception instead of hanging the kernel.
    timeout=60,
)
base64_response.raise_for_status()  # Stop here on any 4xx/5xx status.
base64_response.json()

{'object': 'list',
 'data': [{'object': 'embedding',
   'index': 0,
   'embedding': 'wIQSOwXuF7x5joE8esj/u31Cm7sm4XI81lsgvBbSHL2OkOC7zy3rvLgvzzwwtJQ8AE1uu+Ap0bz6oQg6ELiFvAQZ6TzD0q47pTwdPAUth7ycaXu85IuLO9L55TvHbue7W7mAu0wjmDwpWw48IkW6vHOfOzzKUsM8oyh/PLFYartc4A69lJeHuwa+1bwULbC8Xu+7u/UYQDzl8Qg8TWKHO10HnTxemGu8HWoSPGVv0DuHTzu9RsqRPBBhtbuVKFa6JRG1vO4pershHqw815qPvJkzQLzR6ri8i3KGPAJhjDz4Owu8ZAnTOjRQzTzORcy8oyj/O5kzwDv8sLW8fetKO9rQyruPz8+8v10EvPKjlTpsXhY4ijOXO+/+qDxdcV08Be6XO7Y4g7wjAog8ZJ8SvEx197tS5l48Q4/luyjdrzuicCI8utQ7vfkLSTt3ksQ8Agq8PJOD6TvV3cG8HCsjPGA92LsPUgi9fmkpu+mlojzHbuc6PKWQOtoPurw6AKQ7cgl8PDJZAT0dUrG7RaODvIr0p7sx26I8QZgZvKSm3bt6yP+8Sr0avNBUebw1OOy8fQOsPN3zlbzfAkO7wlRQPMaeqTs2tkq9W2Iwvdq46bmFwq88p+GJvCST1jsihCm9+bR4Ohh3CT1N81U8Xy4rvCVoBbpD/pY86jbxOGvIVrzALUI8wpO/O/iN6juRyww8V66WPDHDQbpyt5y8eNGzPIzr87y5xY67DZW6OmCUqLuBYHU8Zu2uPNACmrw6wTQ8G1b0u4m1uDztcZ08gWB1PF6Ya7vDus07tsnRPHh647yBDhY9gnSTu8s6YjyYYwI8I6u3u3VEKDxcib67XOAOvKT9LTw+tD24MPMDPSY4w7y+3yW7IxX4PLVL8zwFLYc8z9sLO/m0eDqamT28lFi

In [15]:
# Pull the base64 text of the single embedding out of the parsed response.
base64_payload = base64_response.json()
base64_encoded = base64_payload['data'][0]['embedding']
base64_encoded

'wIQSOwXuF7x5joE8esj/u31Cm7sm4XI81lsgvBbSHL2OkOC7zy3rvLgvzzwwtJQ8AE1uu+Ap0bz6oQg6ELiFvAQZ6TzD0q47pTwdPAUth7ycaXu85IuLO9L55TvHbue7W7mAu0wjmDwpWw48IkW6vHOfOzzKUsM8oyh/PLFYartc4A69lJeHuwa+1bwULbC8Xu+7u/UYQDzl8Qg8TWKHO10HnTxemGu8HWoSPGVv0DuHTzu9RsqRPBBhtbuVKFa6JRG1vO4pershHqw815qPvJkzQLzR6ri8i3KGPAJhjDz4Owu8ZAnTOjRQzTzORcy8oyj/O5kzwDv8sLW8fetKO9rQyruPz8+8v10EvPKjlTpsXhY4ijOXO+/+qDxdcV08Be6XO7Y4g7wjAog8ZJ8SvEx197tS5l48Q4/luyjdrzuicCI8utQ7vfkLSTt3ksQ8Agq8PJOD6TvV3cG8HCsjPGA92LsPUgi9fmkpu+mlojzHbuc6PKWQOtoPurw6AKQ7cgl8PDJZAT0dUrG7RaODvIr0p7sx26I8QZgZvKSm3bt6yP+8Sr0avNBUebw1OOy8fQOsPN3zlbzfAkO7wlRQPMaeqTs2tkq9W2Iwvdq46bmFwq88p+GJvCST1jsihCm9+bR4Ohh3CT1N81U8Xy4rvCVoBbpD/pY86jbxOGvIVrzALUI8wpO/O/iN6juRyww8V66WPDHDQbpyt5y8eNGzPIzr87y5xY67DZW6OmCUqLuBYHU8Zu2uPNACmrw6wTQ8G1b0u4m1uDztcZ08gWB1PF6Ya7vDus07tsnRPHh647yBDhY9gnSTu8s6YjyYYwI8I6u3u3VEKDxcib67XOAOvKT9LTw+tD24MPMDPSY4w7y+3yW7IxX4PLVL8zwFLYc8z9sLO/m0eDqamT28lFiYPBiK+btMdfc7KPWQuqFJlDvppSI8+bT4uz51zrtI2b67gDnnvOIljjutjO88KyvMPHrIf7wGVJW8sn/4OyA

#### Decoding base64 and comparing

Now I'll decode it using the same technique as in [`openai.api_resources.embedding.Embedding.create`](https://github.com/openai/openai-python/blob/040f72efc461d747f04c32126ac4285d0f63b993/openai/api_resources/embedding.py#L15):

In [16]:
# Decode the base64 text to raw bytes, then interpret those bytes as an
# array of 32-bit floats.
# NOTE(review): np.float32 uses the machine's native byte order, and the
# payload is presumably little-endian. Confirm before relying on this on a
# big-endian host.
buffer = base64.b64decode(base64_encoded)
base64_embedding = np.frombuffer(buffer, dtype=np.float32)
base64_embedding

array([ 0.0022356957, -0.009273057 ,  0.015815007 , ..., -0.015357706 ,
       -0.019397201 , -0.0028613096], dtype=float32)

In [17]:
# Compare shape and every coordinate exactly (no tolerance). With
# `(a == b).all()`, arrays of different shapes could be broadcast against
# each other, or the comparison could fail outright. np.array_equal reports
# a shape mismatch as False and returns a plain bool.
np.array_equal(default_embedding, base64_embedding)

True