# Model nondeterminism (cursory examination)

SPDX-License-Identifier: 0BSD

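Embedding the same text with the same model multiple times sometimes returns
slightly different results. This is okay, since the results are very similar:
in the run recorded below, the dot product of each pair of corresponding
(approximately unit-length) embeddings is above 0.99999.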

This also shows one way to serialize and deserialize embeddings with the
safetensors library.

In [1]:
import logging

import numpy as np
import openai
import openai.embeddings_utils
import safetensors.numpy

In [2]:
# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)
# Point the openai library at a local key file instead of embedding the key
# in the notebook.
openai.api_key_path = '.api_key'

In [3]:
def save(embeddings, basename):
    """
    Write an array of embeddings to ``{basename}.safetensors``.

    The array is stored as the single tensor named ``'embeddings'``.
    """
    path = f'{basename}.safetensors'
    tensors = {'embeddings': embeddings}
    safetensors.numpy.save_file(tensor_dict=tensors, filename=path)

In [4]:
def load(basename):
    """
    Read embeddings back from ``{basename}.safetensors``.

    Returns the tensor stored under the name ``'embeddings'``.
    """
    path = f'{basename}.safetensors'
    return safetensors.numpy.load_file(path)['embeddings']

In [5]:
def embed(texts, *, engine='text-embedding-ada-002'):
    """
    Embed a batch of texts and return the results as a float32 NumPy array.

    Parameters:
        texts: The strings to embed; passed through unchanged to
            ``openai.embeddings_utils.get_embeddings``.
        engine: Name of the embedding model to request. Defaults to
            ``'text-embedding-ada-002'``, so existing ``embed(texts)`` calls
            behave exactly as before.

    Returns:
        ``np.ndarray`` of dtype ``float32`` built from the returned embeddings.

    Every call issues a fresh request; nothing is cached, so repeated calls
    with identical input may return slightly different values.
    """
    embeddings = openai.embeddings_utils.get_embeddings(texts, engine=engine)
    return np.array(embeddings, dtype=np.float32)

In [6]:
# Sample strings that are passed to embed() in the cells that follow.
TEXTS = [
    'Gee whiz!',
    'Golly wow!',
    'Well, shucks!',
    'The meeting is this afternoon!',
]

In [7]:
# First embedding request for TEXTS.
a = embed(TEXTS)

In [8]:
# Gram matrix: entry (i, j) is the dot product of the embeddings of TEXTS[i]
# and TEXTS[j].
a @ np.transpose(a)

array([[1.        , 0.92359984, 0.8808113 , 0.7807301 ],
       [0.92359984, 1.0000001 , 0.86994064, 0.7691529 ],
       [0.8808113 , 0.86994064, 1.        , 0.75710803],
       [0.7807301 , 0.7691529 , 0.75710803, 1.0000001 ]], dtype=float32)

In [9]:
# Second request with the identical input, for comparison against `a`.
b = embed(TEXTS)

In [10]:
# Are the two runs exactly equal in every component?
(a == b).all()

False

In [11]:
# Elementwise difference between the two runs.
a - b

array([[ 7.6461583e-05, -5.5040931e-05,  2.0070001e-05, ...,
        -1.9634143e-05, -1.1863187e-05, -9.7081065e-06],
       [ 3.8404018e-05,  5.1384559e-06,  3.7722290e-05, ...,
         1.5497208e-05,  7.6873228e-05, -1.5275553e-05],
       [ 6.6961627e-05,  5.0993636e-05,  3.5078265e-05, ...,
        -2.2351742e-08,  1.2743287e-05, -2.9802322e-08],
       [ 2.7018599e-05,  1.1659693e-05, -4.1065738e-05, ...,
         5.4363161e-05, -2.5238958e-05,  2.8731301e-05]], dtype=float32)

In [12]:
# Smallest and largest signed elementwise difference.
(a - b).min(), (a - b).max()

(-0.00031512976, 0.0003888011)

In [13]:
# Mean absolute elementwise difference.
abs(a - b).mean()

3.4732115e-05

In [14]:
# Dot product of each row of `a` with itself (its squared Euclidean norm).
[np.dot(a_row, a_row) for a_row in a]

[0.99999994, 1.0, 1.0, 1.0]

In [15]:
# Dot product of each row of `b` with itself (its squared Euclidean norm).
[np.dot(b_row, b_row) for b_row in b]

[1.0000001, 1.0, 0.9999999, 1.0000002]

In [16]:
# Dot product of each row of `a` with the corresponding row of `b`, i.e. the
# same text's embedding from the first and second runs.
[np.dot(a_row, b_row) for a_row, b_row in zip(a, b)]

[0.9999987, 0.99999833, 0.9999981, 0.99999875]

In [17]:
# Persist both runs; save() writes them to a.safetensors and b.safetensors.
save(a, 'a')
save(b, 'b')