# Base64-encoded embedding from `text-embedding-ada-002`

SPDX-License-Identifier: 0BSD

This examines a second-generation Ada embedding in detail.

In [1]:
import collections
import base64
import json
import re
from typing import Iterable

from cheap_repr import cheap_repr
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
import requests
import tabulate

import keys
import parsing

In [2]:
# Digits of precision NumPy uses when printing floating-point arrays.
FLOAT_PRINT_PRECISION = 10
np.set_printoptions(precision=FLOAT_PRINT_PRECISION)

In [3]:
# Fetch the key once: keep it in `api_key` for the raw HTTP requests made
# later in this notebook, and also register it with the openai library.
api_key = keys.get_api_key()
openai.api_key = api_key

## 1. OpenAI Python library

### 1A. `openai.embeddings_utils.get_embedding`

This is a higher-level interface to `openai.Embedding.create` (below).

In [4]:
# Request the embedding through the convenience helper.
lib_utils_embedding = get_embedding(
    engine='text-embedding-ada-002',
    text='The food was delicious and the waiter...',
)

# Show an abbreviated repr rather than every coordinate.
cheap_repr(lib_utils_embedding)

'[0.002306425478309393, -0.009327292442321777, 0.015797346830368042, ..., -0.015327490866184235, -0.01937841810286045, -0.0028842221945524216]'

Let's save those results, for comparison to Java:

In [5]:
# Persist the embedding as indented JSON so it can be loaded elsewhere.
with open('python-embedding.json', 'w', encoding='utf-8') as embedding_file:
    json.dump(lib_utils_embedding, embedding_file, indent=4)

### 1B. `openai.Embedding.create`

In [6]:
# Call the lower-level library interface directly with the same input text.
lib_response = openai.Embedding.create(
    input='The food was delicious and the waiter...',
    model='text-embedding-ada-002',
)

# Abbreviated repr: the full response contains the whole embedding.
cheap_repr(lib_response)

"{'object': 'list', 'data': [{'object': 'embedding', 'index': 0, 'embedding': [...]}], 'model': 'text-embedding-ada-002-v2', 'usage': {'prompt_tokens': 8, 'total_tokens': 8}}"

In [7]:
# Only one input was sent, so the embedding is in the first `data` entry.
first_result = lib_response.data[0]
lib_embedding = first_result.embedding
cheap_repr(lib_embedding)

'[0.002306425478309393, -0.009327292442321777, 0.015797346830368042, ..., -0.015327490866184235, -0.01937841810286045, -0.0028842221945524216]'

Except on text where `get_embedding` does the extra work of converting newlines
to spaces, or **when nondeterminism in the model gives different results for
equivalent queries**, the results with `openai.embeddings_utils.get_embedding`
and `openai.Embedding.create` will agree:

In [8]:
# Exact equality of the two results: True only if every coordinate matches.
lib_utils_embedding == lib_embedding

True

## 2. POST requests to the OpenAI API endpoint

### 2A. No `encoding_format` (normal way)

In [9]:
# POST directly to the embeddings endpoint without `encoding_format`, so the
# embedding comes back as a JSON array of decimal numbers.
default_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
    },
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server, which would hang the notebook on "Run All".
    timeout=60,
)
default_response.raise_for_status()  # Fail loudly on an HTTP error status.

default_encoded = default_response.json()['data'][0]['embedding']

# Convert to float32 so the result can be compared coordinate-by-coordinate
# with an embedding decoded from raw float32 bytes.
default_embedding = np.array(default_encoded, dtype=np.float32)
default_embedding

array([ 0.0023064255, -0.009327292 ,  0.015797347 , ..., -0.015327491 ,
       -0.019378418 , -0.0028842222], dtype=float32)

#### Do the representations always have enough digits for float32?

Since we are working with floating point, when we count digits to determine
precision, we start with the leftmost *nonzero* digit (that is, we count
*significant digits*). I will call such digits *mantissa digits*. A small
minority of float32 values require 9 base-10 mantissa digits to represent in a
way that round-trips; no float32 value requires more than 9.

Since decimal representations from the API endpoint omit *trailing* digits if
not required for round-tripping, we should expect most values to show fewer
than 9 mantissa digits. A glance at the output shows most of the numbers have
8, as expected. But it is not clear, at a glance, that 9 digits are ever shown,
since that is rarely needed. This shows that they are:

In [10]:
# Work from the raw response text rather than parsed floats, so the digits can
# be counted exactly as the API wrote them. (Both helpers live in the local
# `parsing` module; presumably the first returns each coordinate as a string
# and the second maps digit count -> list of those strings -- see that module.)
coordinate_strings = parsing.extract_coordinate_strings(default_response.text)
mantissa_length_groups = parsing.group_by_mantissa_length(coordinate_strings)

In [11]:
# One (digit count, number of coordinates) row per mantissa length, in
# ascending order of digit count.
mantissa_length_table = [
    (digits, len(mantissa_length_groups[digits]))
    for digits in sorted(mantissa_length_groups)
]

# Render the rows as an HTML table for display in the notebook.
tabulate.tabulate(mantissa_length_table, headers=('digits', 'count'), tablefmt='html')

digits,count
5,6
6,50
7,392
8,1068
9,20


A small fraction of the coordinates require 9 digits:

In [12]:
# Fraction of all coordinates whose decimal form shows 9 mantissa digits.
count_showing_9 = len(mantissa_length_groups[9])
total_coordinates = len(coordinate_strings)
ratio_that_need_9 = count_showing_9 / total_coordinates
print(f'{ratio_that_need_9:.2%}')

1.30%


Those coordinates are:

In [13]:
# Display the coordinates grouped under a mantissa length of 9.
mantissa_length_groups[9]

['-0.0143750785',
 '0.0149592245',
 '0.0138798235',
 '-0.0138798235',
 '0.0111368755',
 '-0.0110479845',
 '-0.0141718965',
 '-0.0144639695',
 '-0.0128004225',
 '0.0132956775',
 '0.000117464195',
 '-0.107686095',
 '0.0127115315',
 '-0.0101146195',
 '-0.0109019475',
 '-0.0144639695',
 '0.0110479845',
 '-0.0117781665',
 '0.0143750785',
 '0.0104638375']

### 2B. Passing `base64` as `encoding_format`

In [14]:
# Same request as the default-format one, but with `encoding_format` set to
# `base64`, so the embedding comes back as a single base64 string.
base64_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
        'encoding_format': 'base64',
    },
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server, which would hang the notebook on "Run All".
    timeout=60,
)
base64_response.raise_for_status()  # Fail loudly on an HTTP error status.
base64_response.json()

{'object': 'list',
 'data': [{'object': 'embedding',
   'index': 0,
   'embedding': 'ZicXO4DRGLxwaYE84t7+uy/BmbtvbXM8AiQhvH7FHL3/E+C77BrrvBUfzzz7cpQ8gFpsu7Uz0byX6AM69ceFvAjR6DzhSa876X8dPEXShrz+lHy8rnqNOxey5Tur8ea7Y5SAu9HhlzxqRQ086QC6vAOrOzz/ksM8aUl/PIJmaLuF7w69KRyJux/i1bzuHrC8pcW7uxtJQTwpHAk8YIKGO0hlnTxzhWu8uUMSPLY5zzt9QDu9MtmRPPnntLv0yT66cn20vMg0fbtpwKs88KmPvHouQbyZ9ri8ezKGPP+KjDxedgq8MtvKOllazTxMhcy8fkQAPObuvztkorW8QshDO1tmybuA2c+8luIFvNPtkzpx8Js437yWO6AmqTyUX108S3eXO6XDgrw394c8npMSvKPB9rtCSWA8Z7zmu/z5rjuJjiE8RuA7vdYHRTvJMsQ8U7W8PD4x6DsoHsK8OH4iPGAL2rscRwi9eacmu77oojx0kec676OROqbLubwBGKU78b97PIsZAT23uLK7LC6DvHihqLtgA6M8mnsavF7/3bvvs/+8ZBsbvIcLebw8Jey8rPWrPBUdlrxtWUC7PaRPPAzhqTuQwEq9jzkwvULCxbmPObA887uJvNus1Ttqxim92SF2Og5sCT2Yd1U8nyArvCwuA7rfvJY8NesLObIhV7wNbkI8KSTAO5wQ6jsaOww8gdeWPDTnRrrB+py8ZaizPPXX87y6SZC7pcW7Ohq8qLsPgnU8ucSuPNDbmby2sjQ8Aq30u3BxuDzB+pw8iRd1PI0vbbs+qs076o3SPDNo47y3NxY9gt2Uu639Yjzp+AI8fky3u5NRKDxSr767XGoOvDJaLjyr7y24s5gDPTXzwrxETSW7NwH4PB5j8jyxjIc8eSAMOyBvbjpF2j28FBe

In [15]:
# Only one input was sent, so the base64 text is in the first `data` entry.
base64_payload = base64_response.json()
base64_encoded = base64_payload['data'][0]['embedding']
base64_encoded

'ZicXO4DRGLxwaYE84t7+uy/BmbtvbXM8AiQhvH7FHL3/E+C77BrrvBUfzzz7cpQ8gFpsu7Uz0byX6AM69ceFvAjR6DzhSa876X8dPEXShrz+lHy8rnqNOxey5Tur8ea7Y5SAu9HhlzxqRQ086QC6vAOrOzz/ksM8aUl/PIJmaLuF7w69KRyJux/i1bzuHrC8pcW7uxtJQTwpHAk8YIKGO0hlnTxzhWu8uUMSPLY5zzt9QDu9MtmRPPnntLv0yT66cn20vMg0fbtpwKs88KmPvHouQbyZ9ri8ezKGPP+KjDxedgq8MtvKOllazTxMhcy8fkQAPObuvztkorW8QshDO1tmybuA2c+8luIFvNPtkzpx8Js437yWO6AmqTyUX108S3eXO6XDgrw394c8npMSvKPB9rtCSWA8Z7zmu/z5rjuJjiE8RuA7vdYHRTvJMsQ8U7W8PD4x6DsoHsK8OH4iPGAL2rscRwi9eacmu77oojx0kec676OROqbLubwBGKU78b97PIsZAT23uLK7LC6DvHihqLtgA6M8mnsavF7/3bvvs/+8ZBsbvIcLebw8Jey8rPWrPBUdlrxtWUC7PaRPPAzhqTuQwEq9jzkwvULCxbmPObA887uJvNus1Ttqxim92SF2Og5sCT2Yd1U8nyArvCwuA7rfvJY8NesLObIhV7wNbkI8KSTAO5wQ6jsaOww8gdeWPDTnRrrB+py8ZaizPPXX87y6SZC7pcW7Ohq8qLsPgnU8ucSuPNDbmby2sjQ8Aq30u3BxuDzB+pw8iRd1PI0vbbs+qs076o3SPDNo47y3NxY9gt2Uu639Yjzp+AI8fky3u5NRKDxSr767XGoOvDJaLjyr7y24s5gDPTXzwrxETSW7NwH4PB5j8jyxjIc8eSAMOyBvbjpF2j28FBeYPHo2+LscV/Y7/oSOukx9lTv0SCI8hwv5uxYlzbu+ab+7dJHnvMkqjTtxee88CFDMPDsPgLzFEpW8Uav5O0+

#### Decoding base64 and comparing

Now I'll decode it using the same technique as in [`openai.api_resources.embedding.Embedding.create`](https://github.com/openai/openai-python/blob/040f72efc461d747f04c32126ac4285d0f63b993/openai/api_resources/embedding.py#L15):

In [16]:
# Step 1: base64 text -> raw bytes.
buffer = base64.b64decode(base64_encoded)

# Step 2: reinterpret those bytes as float32 values (no copy, no parsing).
base64_embedding = np.frombuffer(buffer, np.float32)
base64_embedding

array([ 0.0023064255, -0.009327292 ,  0.015797347 , ..., -0.015327491 ,
       -0.019378418 , -0.0028842222], dtype=float32)

In [17]:
# Exact comparison of the two float32 arrays. `np.array_equal` also checks
# that the shapes match and returns a plain boolean, whereas elementwise `==`
# followed by `.all()` relies on broadcasting and misbehaves (or raises) if the
# two arrays ever have different lengths.
np.array_equal(default_embedding, base64_embedding)

True