# Base64-encoded embedding from `text-embedding-ada-002`

SPDX-License-Identifier: 0BSD

This examines a second-generation Ada embedding in detail.

In [1]:
import collections
import base64
import json
import re
from typing import Iterable

from cheap_repr import cheap_repr
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
import requests
import tabulate

import keys
import parsing

In [2]:
# Print NumPy floats with up to 10 digits of precision (NumPy's default is 8),
# so array outputs in this notebook show more of each float32 value.
np.set_printoptions(precision=10)

In [3]:
# Fetch the key once. The local copy is used for the raw HTTP requests made
# later in this notebook; the same value is handed to the openai library.
api_key = keys.get_api_key()
openai.api_key = api_key

## 1. OpenAI Python library

### 1A. `openai.embeddings_utils.get_embedding`

This is a higher-level interface to `openai.Embedding.create` (below).

In [4]:
# Request the embedding through the library's convenience helper. The same
# input text and model are used for every request in this notebook, so the
# results can be compared with one another.
lib_utils_embedding = get_embedding(
    text='The food was delicious and the waiter...',
    engine='text-embedding-ada-002',
)
# Display an abbreviated repr rather than every coordinate.
cheap_repr(lib_utils_embedding)

'[0.002253931947052479, -0.00933318305760622, 0.015745779499411583, ..., -0.015288643538951874, -0.019364768639206886, -0.002796780550852418]'

Let's save those results, for comparison to Java:

In [5]:
# Write the embedding to a UTF-8 JSON file, indented by 4 spaces, so it can be
# compared against the embedding obtained from Java.
with open('python-embedding.json', mode='w', encoding='utf-8') as file:
    json.dump(lib_utils_embedding, file, indent=4)

### 1B. `openai.Embedding.create`

In [6]:
# Same request as above, made through the lower-level library call, which
# returns the whole response object rather than just the embedding.
lib_response = openai.Embedding.create(
    model='text-embedding-ada-002',
    input='The food was delicious and the waiter...',
)
# Display an abbreviated repr of the response.
cheap_repr(lib_response)

"{'object': 'list', 'data': [{'object': 'embedding', 'index': 0, 'embedding': [...]}], 'model': 'text-embedding-ada-002-v2', 'usage': {'prompt_tokens': 8, 'total_tokens': 8}}"

In [7]:
# The response holds a list of results under `data`. Only one input was sent,
# so the embedding of interest is in the first (index 0) entry.
lib_embedding = lib_response.data[0].embedding
cheap_repr(lib_embedding)

'[0.002253931947052479, -0.00933318305760622, 0.015745779499411583, ..., -0.015288643538951874, -0.019364768639206886, -0.002796780550852418]'

Except on text where `get_embedding` does the extra work of converting newlines
to spaces, or **when nondeterminism in the model gives different results for
equivalent queries**, the results with `openai.embeddings_utils.get_embedding`
and `openai.Embedding.create` will agree:

In [8]:
# Exact comparison with no tolerance: True only if the two embeddings have the
# same coordinates in the same order.
lib_utils_embedding == lib_embedding

True

## 2. POST requests to the OpenAI API endpoint

### 2A. No `encoding_format` (normal way)

In [9]:
# Call the REST endpoint directly instead of going through the openai library.
# No `encoding_format` is given, so the embedding comes back as a JSON list of
# decimal numbers.
default_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
    },
    # requests waits forever by default; bound the wait (in seconds) so that a
    # stalled connection cannot hang the notebook indefinitely.
    timeout=60,
)
# Raise immediately on an HTTP error status instead of failing later on a
# response body that lacks the expected keys.
default_response.raise_for_status()
default_encoded = default_response.json()['data'][0]['embedding']
# Store as float32 so it can be compared directly with the float32 array
# decoded from the base64 response later in this notebook.
default_embedding = np.array(default_encoded, dtype=np.float32)
default_embedding

array([ 0.002253932 , -0.009333183 ,  0.01574578  , ..., -0.015288644 ,
       -0.019364769 , -0.0027967806], dtype=float32)

#### Do the representations always have enough digits for float32?

Since we are working with floating point, when we count digits to determine
precision, we start with the leftmost *nonzero* digit. I will call such digits
*mantissa digits*. A small minority of float32 values require 9 base-10
mantissa digits to represent.

Since decimal representations from the API endpoint omit *trailing* digits if
not required for round-tripping, we should expect most values to show fewer
than 9 mantissa digits. A glance at the output shows most of the numbers have
8, as expected. But it is not clear, at a glance, whether 9 digits are ever
shown, since they are rarely needed. The following tally shows that they are:

In [10]:
# Work from the raw response text rather than from parsed floats, so that the
# digits are examined as the API endpoint wrote them.
coordinate_strings = parsing.extract_coordinate_strings(default_response.text)
# NOTE(review): the helper lives in the local `parsing` module. Judging by how
# the result is used below, it presumably maps each mantissa-digit count to
# the coordinate strings that have that many digits -- confirm.
mantissa_length_groups = parsing.group_by_mantissa_length(coordinate_strings)

In [11]:
# One (digits, count) row per mantissa length, in ascending order of length.
# The lengths are distinct mapping keys, so they alone determine the order.
mantissa_length_table = sorted(
    (digit_count, len(members))
    for digit_count, members in mantissa_length_groups.items()
)
# Render the rows as an HTML table for rich display in the notebook.
tabulate.tabulate(
    mantissa_length_table,
    headers=('digits', 'count'),
    tablefmt='html',
)

digits,count
5,1
6,34
7,425
8,1051
9,25


A small fraction of the coordinates require 9 digits:

In [12]:
# Share of all coordinates whose decimal form was written with 9 mantissa
# digits.
ratio_that_need_9 = len(mantissa_length_groups[9]) / len(coordinate_strings)
# Show the ratio as a percentage with two decimal places.
print(f'{ratio_that_need_9:.2%}')

1.63%


Those coordinates are:

In [13]:
# Display every coordinate string that was written with 9 mantissa digits.
mantissa_length_groups[9]

['-0.0117331445',
 '0.0131045515',
 '0.0114728315',
 '-0.0101331705',
 '0.0147045255',
 '0.0108506195',
 '-0.0112442635',
 '0.0147680165',
 '-0.0144759575',
 '-0.0153902285',
 '-0.0103299925',
 '0.0144759575',
 '0.0131680425',
 '0.0107553825',
 '-0.0144759575',
 '-0.000104363404',
 '-0.0107553825',
 '-0.0121585345',
 '0.0103617385',
 '-0.0152251525',
 '0.0112442635',
 '-0.0101331705',
 '0.0147045255',
 '0.0121902805',
 '-0.0140823135']

### 2B. Passing `base64` as `encoding_format`

In [14]:
# Same direct REST request as before, except that `encoding_format` asks for
# the embedding as a single base64 string instead of a JSON list of numbers.
base64_response = requests.post(
    url='https://api.openai.com/v1/embeddings',
    headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    },
    json={
        'input': 'The food was delicious and the waiter...',
        'model': 'text-embedding-ada-002',
        'encoding_format': 'base64',
    },
    # requests waits forever by default; bound the wait (in seconds) so that a
    # stalled connection cannot hang the notebook indefinitely.
    timeout=60,
)
# Raise immediately on an HTTP error status instead of failing later on a
# response body that lacks the expected keys.
base64_response.raise_for_status()
base64_response.json()

{'object': 'list',
 'data': [{'object': 'embedding',
   'index': 0,
   'embedding': 'tLYTOzXqGLxL/YA8M0b/uwdfmrsdNXM8iJIfvEOOHL3IJeK7Ok3rvPWxzjxgppQ8K0FvuzDh0LyYBwI6pauFvBvO6DxDja47kJgdPPK1hrxCUnu8MLSLO/xO5jsQ+ee7JHiAuwSQlzzVnQ48GGm6vAaOOzwFJsM8KXF+PAYkZ7taCA+9gvWHu27f1bz6UbC8b0i8u+WmQDxlRQg8zDCGO2kTnTzKjGy80GYTPLJ8zjsiPju9oAySPOtFtLuZm1G660W0vHDdebuD86s8WgiPvGA8QLyIKbm8UZuGPB/ZjDyrSQu8hPHPOmVyzTy6gsy8pAb+O47Hvju9urW8vrlHO2KjyruE8c+8TswDvEC/mTp/JgU4sH+YO8NZqTzMXF0887SYO76Mgrz8ioc8JXcSvMvz9rsjPF88gbnmuxYCsDvEwSE86t07vS56RjvXmsQ8i/i7PFMu6Dvwe8G8b7EiPEFU17vEKgi9SMQpu86Wojxo2Ok6M4OOOpL+ubxCJqQ7CvJ7PHGCAT3odrG7C5eDvEX1prtvsaI8ZkQavH9S3Lu5sP+8jckavKg9ebxEIuy8g/OrPPDllbw9hkK7xyZQPLF+qjsDvkq9+lEwvdNjybnTzK88OLqJvDZ/1jumqSm9mTF9OrJPCT15tFY8QL4rvLmw/7k98JY81DSoODZ/VrzUy0E8HgfAO10D6Ttz6Qs8PfCWPClEOboAWZy8P1azPKKf87xXOYy7Qb29Oocqp7sEVHY8onKuPMUpmryXNTU8op/zu8CJuDyQmJ08+X51PKoNarstEs47fevRPFdl47wyGxY9oAySuwpbYjyi3AI8wIm4u98JKTztrL67kmgOvAstLzzj1k+4TswDPX+7wrzOliK7GP73PFWV8jwYO4c8j5kLO9bIdzqgoj28KxW

In [15]:
# With `encoding_format` set to `base64`, the `embedding` field is a single
# string rather than a list of numbers.
base64_encoded = base64_response.json()['data'][0]['embedding']
base64_encoded

'tLYTOzXqGLxL/YA8M0b/uwdfmrsdNXM8iJIfvEOOHL3IJeK7Ok3rvPWxzjxgppQ8K0FvuzDh0LyYBwI6pauFvBvO6DxDja47kJgdPPK1hrxCUnu8MLSLO/xO5jsQ+ee7JHiAuwSQlzzVnQ48GGm6vAaOOzwFJsM8KXF+PAYkZ7taCA+9gvWHu27f1bz6UbC8b0i8u+WmQDxlRQg8zDCGO2kTnTzKjGy80GYTPLJ8zjsiPju9oAySPOtFtLuZm1G660W0vHDdebuD86s8WgiPvGA8QLyIKbm8UZuGPB/ZjDyrSQu8hPHPOmVyzTy6gsy8pAb+O47Hvju9urW8vrlHO2KjyruE8c+8TswDvEC/mTp/JgU4sH+YO8NZqTzMXF0887SYO76Mgrz8ioc8JXcSvMvz9rsjPF88gbnmuxYCsDvEwSE86t07vS56RjvXmsQ8i/i7PFMu6Dvwe8G8b7EiPEFU17vEKgi9SMQpu86Wojxo2Ok6M4OOOpL+ubxCJqQ7CvJ7PHGCAT3odrG7C5eDvEX1prtvsaI8ZkQavH9S3Lu5sP+8jckavKg9ebxEIuy8g/OrPPDllbw9hkK7xyZQPLF+qjsDvkq9+lEwvdNjybnTzK88OLqJvDZ/1jumqSm9mTF9OrJPCT15tFY8QL4rvLmw/7k98JY81DSoODZ/VrzUy0E8HgfAO10D6Ttz6Qs8PfCWPClEOboAWZy8P1azPKKf87xXOYy7Qb29Oocqp7sEVHY8onKuPMUpmryXNTU8op/zu8CJuDyQmJ08+X51PKoNarstEs47fevRPFdl47wyGxY9oAySuwpbYjyi3AI8wIm4u98JKTztrL67kmgOvAstLzzj1k+4TswDPX+7wrzOliK7GP73PFWV8jwYO4c8j5kLO9bIdzqgoj28KxWYPOCd+LtGifY7EM2QukT2lDvOliI8OH36u84szrslDb67i47nvDODjjtudu88d03MPOFCgLzlEJW863L5O4i

#### Decoding base64 and comparing

Now I'll decode it using the same technique as in [`openai.api_resources.embedding.Embedding.create`](https://github.com/openai/openai-python/blob/040f72efc461d747f04c32126ac4285d0f63b993/openai/api_resources/embedding.py#L15):

In [16]:
# Decode the base64 text into raw bytes.
buffer = base64.b64decode(base64_encoded)
# Reinterpret those bytes as a 1-D array of float32 values.
# NOTE(review): np.float32 uses the machine's native byte order; this assumes
# the bytes sent by the API match it (little-endian on typical hardware) --
# confirm before relying on this on a big-endian host.
base64_embedding = np.frombuffer(buffer, dtype=np.float32)
base64_embedding

array([ 0.002253932 , -0.009333183 ,  0.01574578  , ..., -0.015288644 ,
       -0.019364769 , -0.0027967806], dtype=float32)

In [17]:
# Exact comparison of the two float32 arrays. np.array_equal also requires the
# shapes to match, so arrays of different lengths give a plain False. With
# `==` followed by `.all()`, such a mismatch would either raise an error or,
# if one array had a single element, be silently broadcast.
np.array_equal(default_embedding, base64_embedding)

True