# Model nondeterminism (cursory examination)

SPDX-License-Identifier: 0BSD

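Embedding the same text with the same model multiple times sometimes returns
slightly different results. This is okay, since the results are sufficiently
similar: in the run recorded below, corresponding coordinates differ by at most
about 2e-4, and the dot product of each pair of corresponding embeddings is
within 2e-6 of 1.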

This also shows one way to serialize and deserialize embeddings with the
safetensors library.

In [1]:
import logging

import numpy as np
import openai
import openai.embeddings_utils
import safetensors.numpy

In [2]:
# Log at INFO level so each OpenAI API request and response is shown in the
# cell output.
logging.basicConfig(level=logging.INFO)
# Point the openai library at a key file instead of embedding the key itself
# in the notebook.
openai.api_key_path = '.api_key'

In [3]:
def save(embeddings, basename):
    """Write embeddings to '<basename>.safetensors' under the 'embeddings' key."""
    path = f'{basename}.safetensors'
    tensors = {'embeddings': embeddings}
    safetensors.numpy.save_file(tensor_dict=tensors, filename=path)

In [4]:
def load(basename):
    """Return the 'embeddings' entry read from '<basename>.safetensors'."""
    path = f'{basename}.safetensors'
    return safetensors.numpy.load_file(path)['embeddings']

In [5]:
def embed(texts, *, engine='text-embedding-ada-002'):
    """
    Embed texts with an OpenAI model and return the results as a float32 array.

    texts: the strings to embed, passed unchanged to
        openai.embeddings_utils.get_embeddings.
    engine: name of the embedding model to request. Keyword-only. The default
        is the model this notebook examines, so existing embed(texts) calls
        behave as before.

    Returns a NumPy float32 array built from the embeddings the library
    returns (one row per text, assuming one equal-length vector per text).
    """
    embeddings = openai.embeddings_utils.get_embeddings(texts, engine=engine)
    return np.array(embeddings, dtype=np.float32)

In [6]:
# Sample inputs to embed: three short exclamations and one ordinary sentence.
TEXTS = [
    'Gee whiz!',
    'Golly wow!',
    'Well, shucks!',
    'The meeting is this afternoon!',
]

In [7]:
# First embedding of TEXTS.
a = embed(TEXTS)

INFO:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=160 request_id=f991af66f6a14ef355a474a3ea7b5701 response_code=200


In [8]:
# Pairwise dot products between the rows of a.
a @ a.T

array([[1.        , 0.9236444 , 0.880864  , 0.78083616],
       [0.9236444 , 1.        , 0.86998427, 0.76927197],
       [0.880864  , 0.86998427, 1.0000004 , 0.7571081 ],
       [0.78083616, 0.76927197, 0.7571081 , 0.99999976]], dtype=float32)

In [9]:
# Second embedding of the same TEXTS, for comparison with a.
b = embed(TEXTS)

INFO:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=27 request_id=8abc5cf3fefd8fcbea40877589117248 response_code=200


In [10]:
# Are the two results exactly equal in every coordinate?
(a == b).all()

False

In [11]:
# Coordinate-wise differences between the two results.
a - b

array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 7.6527242e-05,  3.8238242e-05,  1.2759585e-05, ...,
         3.8238242e-05,  3.1872652e-05, -6.3799322e-05],
       [ 5.8477744e-06, -4.0630111e-05, -1.1708587e-05, ...,
         2.4458393e-05,  2.5135232e-06,  1.1369586e-05]], dtype=float32)

In [12]:
# Most negative and most positive coordinate-wise difference.
np.min(a - b), np.max(a - b)

(-0.00021465123, 0.00017218664)

In [13]:
# Mean absolute coordinate-wise difference.
np.abs(a - b).mean()

1.8379125e-05

In [14]:
# Dot product of each row of a with itself, i.e. its squared Euclidean norm.
[np.dot(a_row, a_row) for a_row in a]

[1.0000001, 1.0000001, 1.0, 1.0]

In [15]:
# Dot product of each row of b with itself, i.e. its squared Euclidean norm.
[np.dot(b_row, b_row) for b_row in b]

[1.0000001, 1.0000001, 1.0000001, 1.0]

In [16]:
# Dot product of each row of a with the corresponding row of b: how closely
# the two embeddings of the same text agree.
[np.dot(a_row, b_row) for a_row, b_row in zip(a, b)]

[1.0000001, 1.0000001, 0.9999981, 0.99999875]

In [17]:
# Write each result to its own file: a.safetensors and b.safetensors.
save(a, 'a')
save(b, 'b')