# Base64-encoded embedding from `text-embedding-ada-002`

SPDX-License-Identifier: 0BSD

This examines a second-generation Ada embedding in detail.

In [1]:
import collections
import base64
import json
import re
from typing import Iterable

from cheap_repr import cheap_repr
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
import requests
import tabulate

import keys
import parsing

In [2]:
# Digits of precision NumPy uses when printing floats in the array reprs below
# (NumPy's own default is 8).
FLOAT_PRINT_PRECISION = 10
np.set_printoptions(precision=FLOAT_PRINT_PRECISION)

In [3]:
# Fetch the key once; keep it in `api_key` for the hand-written POST requests
# below, and also hand it to the openai library.
api_key = keys.get_api_key()
openai.api_key = api_key

## 1. OpenAI Python library

### 1A. `openai.embeddings_utils.get_embedding`

This is a higher-level interface to `openai.Embedding.create` (below).

In [4]:
# Embed the sample sentence through the convenience helper, then show an
# abbreviated repr rather than every coordinate.
lib_utils_embedding = get_embedding(
    engine='text-embedding-ada-002',
    text='The food was delicious and the waiter...',
)
cheap_repr(lib_utils_embedding)

'[0.0023063174448907375, -0.009358600713312626, 0.01578390970826149, ..., -0.015301376581192017, -0.01935211382806301, -0.002817421453073621]'

Let's save those results, for comparison to Java:

In [5]:
# Serialize the embedding to indented JSON text and write it out as UTF-8.
with open('python-embedding.json', mode='w', encoding='utf-8') as file:
    file.write(json.dumps(lib_utils_embedding, indent=4))

### 1B. `openai.Embedding.create`

In [6]:
# Same sentence and model as above, but through the lower-level library call,
# which returns the whole response object rather than just the embedding.
lib_response = openai.Embedding.create(
    input='The food was delicious and the waiter...',
    model='text-embedding-ada-002',
)
cheap_repr(lib_response)

"{'object': 'list', 'data': [{'object': 'embedding', 'index': 0, 'embedding': [...]}], 'model': 'text-embedding-ada-002-v2', 'usage': {'prompt_tokens': 8, 'total_tokens': 8}}"

In [7]:
# Only one input was sent, so the embedding of interest is the first (and, per
# the repr above, only) entry of the response's data list.
first_result = lib_response.data[0]
lib_embedding = first_result.embedding
cheap_repr(lib_embedding)

'[0.0023063174448907375, -0.009358600713312626, 0.01578390970826149, ..., -0.015301376581192017, -0.01935211382806301, -0.002817421453073621]'

Except on text where `get_embedding` does the extra work of converting newlines
to spaces, or **when nondeterminism in the model gives different results for
equivalent queries**, the results with `openai.embeddings_utils.get_embedding`
and `openai.Embedding.create` will agree:

In [8]:
# Exact comparison of the two results, with no tolerance: this is True only if
# every coordinate matches exactly.
lib_utils_embedding == lib_embedding

True

## 2. POST requests to the OpenAI API endpoint

### 2A. No `encoding_format` (normal way)

In [9]:
# Call the REST endpoint directly. With no `encoding_format` in the payload,
# the embedding is returned as a JSON array of decimal numbers.
default_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
    },
    # requests never times out by default; bound the wait so that a stalled
    # connection cannot hang the kernel during "Run All".
    timeout=60,
)
# Fail loudly on an HTTP error status instead of parsing an error body.
default_response.raise_for_status()
default_encoded = default_response.json()['data'][0]['embedding']
# Store as float32 so it can be compared to the base64-decoded float32 array.
default_embedding = np.array(default_encoded, dtype=np.float32)
default_embedding

array([ 0.0023063174, -0.009358601 ,  0.01578391  , ..., -0.015301377 ,
       -0.019352114 , -0.0028174215], dtype=float32)

#### Do the representations always have enough digits for float32?

Since we are working with floating point, when we count digits to determine
precision, we start with the leftmost *nonzero* digit. I will call such digits
*mantissa digits*. A small minority of float32 values require 9 base-10
mantissa digits to represent.

Since decimal representations from the API endpoint omit *trailing* digits if
not required for round-tripping, we should expect most values to show fewer
than 9 mantissa digits. A glance at the output shows most of the numbers have
8, as expected. But it is not clear, at a glance, that 9 digits are ever shown,
since that is rarely needed. This shows that they are:

In [10]:
# Work from the raw response text (not the parsed JSON), so the digits are
# examined exactly as the API endpoint wrote them.
# NOTE(review): both helpers live in the local `parsing` module, not shown
# here; presumably the first returns the coordinates as decimal strings and
# the second maps each mantissa-digit count to the strings having that many
# digits -- confirm against that module.
coordinate_strings = parsing.extract_coordinate_strings(default_response.text)
mantissa_length_groups = parsing.group_by_mantissa_length(coordinate_strings)

In [11]:
# One (digits, count) row per mantissa length, in ascending order of length.
mantissa_length_table = [
    (length, len(mantissa_length_groups[length]))
    for length in sorted(mantissa_length_groups)
]
# Render the rows as an HTML table so the notebook displays them richly.
tabulate.tabulate(
    mantissa_length_table,
    tablefmt='html',
    headers=('digits', 'count'),
)

digits,count
5,5
6,55
7,417
8,1041
9,18


A small fraction of the coordinates require 9 digits:

In [12]:
# Share of all coordinates whose decimal text has 9 mantissa digits.
count_that_need_9 = len(mantissa_length_groups[9])
ratio_that_need_9 = count_that_need_9 / len(coordinate_strings)
print(f'{ratio_that_need_9:.2%}')

1.17%


Those coordinates are:

In [13]:
# Show the coordinate strings that were grouped under 9 mantissa digits.
mantissa_length_groups[9]

['0.0117331715',
 '-0.0131299775',
 '0.0107935015',
 '0.0134855285',
 '0.0146537665',
 '-0.0128379185',
 '-0.0134220375',
 '0.0134855285',
 '-0.0140696475',
 '-0.0114093665',
 '-0.0152378855',
 '0.0105966795',
 '-0.0146537665',
 '-0.0137140965',
 '-0.0152378855',
 '-0.0102728745',
 '0.0154664535',
 '0.0110855615']

### 2B. Passing `base64` as `encoding_format`

In [14]:
# Same request as in 2A, except that `encoding_format` asks the endpoint to
# return the embedding as a single base64 string instead of a JSON array.
base64_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
        'encoding_format': 'base64',
    },
    # requests never times out by default; bound the wait so that a stalled
    # connection cannot hang the kernel during "Run All".
    timeout=60,
)
# Fail loudly on an HTTP error status instead of parsing an error body.
base64_response.raise_for_status()
base64_response.json()

{'object': 'list',
 'data': [{'object': 'embedding',
   'index': 0,
   'embedding': 'liUXO9FUGbxCTYE8ygb+u0Xkmrt5lXI8j7cgvFqOHL0ihuG7XU3rvNF8zjwYwZQ8U3hqu9VL0bxTKAA61VuFvD7O6DxoYq87p5gdPBGLh7yqh3u8iOmLO/DD5zt2Lui7Xv2AuzdAlzzqnQ488TO6vAbeOzwGdsM8WkZ/PKO5ZrvO7Q69ekWIuwl11by1bLC8b5i8u308QDwmNQk8uauFOyIunTzit2u8+EESPOYm0DuBczu9c9eRPN/As7v4aUe66pW0vJXdebvfKKw8U1iPvAKnQLzcibi8ImaGPFCJjDwDfwu8ZIvSOuNXzTxd7cy81Nv+OzAyvzuWhbW8wAlIO9Wzybseh8+8kiYFvB/Hkjp61bU3enWXO5kkqTztXF08jx+ZO1f3grz12oc8O3cSvD7+97vKpl88QQXku3M3sDsCRyI8p/g7vUGlRTsQS8Q86i28PGtZ5zvxy8G8yuYiPCn017vZKgi9XvUmu0V8ojyAA+k6ESOPOg3kubw0oaM7L/J7PCadAT1sMbK7fXyDvGyZqrvmlqI8+NkZvO1c3bvfsP+8YZQavM09ebwwwuy8uaOrPItQlry8OkW7fWxPPN8orDuAo0q9j+cvvWTzyrmZvLA8q5+JvJm01jthxCm9ygZ+OsdPCT3c6VY8UOkqvBTCArqWJZc85zYEOR4fV7zKRkE8MDK/O0ij6TvKHgw8stWWPDAyP7r7qJy8+3CzPMaf87xTWI+7Vre/OtxZqbvmHnY8/6euPNwpmryB2zM8CdXzux6/uDw+3pw8mRR1PBBDarvjV807WrbRPLya47xJGxY9NHGUuyxbYjy23AI8Om+4uxS6KDxWt7+7bwgPvBtYLjwGTg63pAEEPblrwrz7QCS7+8j3PLzK8jxvcIc8Hy8LO1bfdDp6bT28hEq

In [15]:
# Parse the response body, then pick out the base64 text of the one embedding.
base64_payload = base64_response.json()
base64_encoded = base64_payload['data'][0]['embedding']
base64_encoded

'liUXO9FUGbxCTYE8ygb+u0Xkmrt5lXI8j7cgvFqOHL0ihuG7XU3rvNF8zjwYwZQ8U3hqu9VL0bxTKAA61VuFvD7O6DxoYq87p5gdPBGLh7yqh3u8iOmLO/DD5zt2Lui7Xv2AuzdAlzzqnQ488TO6vAbeOzwGdsM8WkZ/PKO5ZrvO7Q69ekWIuwl11by1bLC8b5i8u308QDwmNQk8uauFOyIunTzit2u8+EESPOYm0DuBczu9c9eRPN/As7v4aUe66pW0vJXdebvfKKw8U1iPvAKnQLzcibi8ImaGPFCJjDwDfwu8ZIvSOuNXzTxd7cy81Nv+OzAyvzuWhbW8wAlIO9Wzybseh8+8kiYFvB/Hkjp61bU3enWXO5kkqTztXF08jx+ZO1f3grz12oc8O3cSvD7+97vKpl88QQXku3M3sDsCRyI8p/g7vUGlRTsQS8Q86i28PGtZ5zvxy8G8yuYiPCn017vZKgi9XvUmu0V8ojyAA+k6ESOPOg3kubw0oaM7L/J7PCadAT1sMbK7fXyDvGyZqrvmlqI8+NkZvO1c3bvfsP+8YZQavM09ebwwwuy8uaOrPItQlry8OkW7fWxPPN8orDuAo0q9j+cvvWTzyrmZvLA8q5+JvJm01jthxCm9ygZ+OsdPCT3c6VY8UOkqvBTCArqWJZc85zYEOR4fV7zKRkE8MDK/O0ij6TvKHgw8stWWPDAyP7r7qJy8+3CzPMaf87xTWI+7Vre/OtxZqbvmHnY8/6euPNwpmryB2zM8CdXzux6/uDw+3pw8mRR1PBBDarvjV807WrbRPLya47xJGxY9NHGUuyxbYjy23AI8Om+4uxS6KDxWt7+7bwgPvBtYLjwGTg63pAEEPblrwrz7QCS7+8j3PLzK8jxvcIc8Hy8LO1bfdDp6bT28hEqYPDMp97uuvvY7RbSLurnblDsCRyI8zT35uxu4zLvjJ767ro7nvJ2TjTtPQe88lU3MPFMogLzDsJW8dl73OzT

#### Decoding base64 and comparing

Now I'll decode it using the same technique as in [`openai.api_resources.embedding.Embedding.create`](https://github.com/openai/openai-python/blob/040f72efc461d747f04c32126ac4285d0f63b993/openai/api_resources/embedding.py#L15):

In [16]:
# Decode the base64 text into raw bytes.
buffer = base64.b64decode(base64_encoded)
# Reinterpret those bytes as 32-bit floats; no decimal text is parsed here.
# NOTE(review): np.float32 is native-endian, so this presumably relies on the
# payload's byte order matching the machine's -- confirm on big-endian hosts.
base64_embedding = np.frombuffer(buffer, dtype=np.float32)
base64_embedding

array([ 0.0023063174, -0.009358601 ,  0.01578391  , ..., -0.015301377 ,
       -0.019352114 , -0.0028174215], dtype=float32)

In [17]:
# Exact elementwise comparison: True only if every float32 coordinate parsed
# from decimal text equals the corresponding base64-decoded coordinate.
np.all(default_embedding == base64_embedding)

True