# `fr2ex.embedding.embed_many` demo

For use on the actual data set of repository names, see `main.ipynb`.

In [1]:
import logging

import msgpack
import numpy as np

import fr2ex

In [2]:
# Emit INFO-level records so the logging from the embedding calls below is shown.
logging.basicConfig(level=logging.INFO)

In [3]:
# Sample inputs for the demo: three short interjections and one full sentence.
TEXTS = [
    'Gee whiz!',
    'Golly wow!',
    'Well, shucks!',
    'The meeting is this afternoon!',
]

## With caching enabled

This is the normal way to use `embed_many`.

In [4]:
# Embed every text in TEXTS with a single call to the public (cached) entry point.
embeddings = fr2ex.embedding.embed_many(TEXTS)

INFO:root:Querying OpenAI embeddings endpoint.
INFO:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=331 request_id=bb640ab0b4aaaa62fe2d3a1ef7b406b7 response_code=200


In [5]:
# Pairwise dot products between all rows. The diagonal of the result is ~1.0,
# i.e. each row has (approximately) unit length.
embeddings @ embeddings.T

array([[0.99999996, 0.9225741 , 0.88094956, 0.78086158],
       [0.9225741 , 0.99999993, 0.8698922 , 0.76907447],
       [0.88094956, 0.8698922 , 0.99999995, 0.7570367 ],
       [0.78086158, 0.76907447, 0.7570367 , 0.99999999]])

## Comparing separate serializations

In [6]:
# Load two msgpack files that were saved for embeddings of the same texts.
# NOTE(review): the subtraction in the next cell yields an ndarray, so
# msgpack.load evidently returns NumPy arrays here -- presumably through a
# decoder hook installed elsewhere; confirm in fr2ex.
with open('embeddings-c5e7a088e88de307e7076d8e19ef5913-old.msgpack', 'rb') as file:
    loaded1 = msgpack.load(file)

with open('embeddings-c5e7a088e88de307e7076d8e19ef5913.msgpack', 'rb') as file:
    loaded2 = msgpack.load(file)

# Exact (bitwise-value) comparison. Unlike `(loaded1 == loaded2).all()`,
# array_equal also returns False when the shapes differ instead of failing.
np.array_equal(loaded1, loaded2)

False

In [7]:
# Elementwise difference between the two loaded serializations.
loaded2 - loaded1

array([[-3.67610715e-04,  1.46796799e-03,  5.27247787e-04, ...,
        -6.94829971e-04, -9.71567351e-04, -2.23577023e-04],
       [-2.65697017e-05, -3.48603353e-05,  2.75913626e-05, ...,
        -2.74460763e-05,  8.53426754e-05, -6.71409070e-05],
       [ 1.63228251e-05,  2.37338245e-05,  4.18960117e-05, ...,
        -1.72853470e-06, -7.73686916e-05,  3.59173864e-05],
       [-4.50620428e-06, -3.17757949e-05, -2.97678635e-05, ...,
         3.03797424e-05, -3.18257371e-05, -2.09417194e-05]])

In [8]:
# Is the result from the cached entry point exactly equal to the older saved
# file? array_equal returns False on a shape mismatch rather than failing.
np.array_equal(embeddings, loaded1)

False

In [9]:
# Is the result from the cached entry point exactly equal to the newer saved
# file? array_equal returns False on a shape mismatch rather than failing.
np.array_equal(embeddings, loaded2)

True

In [11]:
# Largest absolute elementwise difference between the two serializations.
np.abs(loaded2 - loaded1).max()

0.003502288367599249

In [20]:
# Dot product of each row of the old file with the matching row of the new file.
[np.dot(old_row, new_row) for old_row, new_row in zip(loaded1, loaded2)]

[0.9993936427215655, 0.999998359704665, 0.9999980855722359, 0.9999982262121097]

In [22]:
# Dot product of each row of loaded1 with itself, i.e. its squared Euclidean norm.
[np.dot(row, row) for row in loaded1]

[0.9999999798377719,
 1.0000000998125569,
 0.9999999759877776,
 1.0000000403818028]

## With caching disabled

Bypassing the cache currently relies on implementation details of `fr2ex` and
should only be done when investigating specific unexpected behavior. It's
included in this notebook because saved `.msgpack` files for embeddings of the
same texts turned out to be different binary files, and I wanted to make sure
that was due to nondeterminism in the results from the model rather than a
problem with serialization.

In [12]:
# NOTE(review): private helper; judging by its name it makes sure an API key is
# configured before the undecorated embed_many is called below -- confirm in
# fr2ex._task.
fr2ex._task._ensure_api_key()

In [13]:
# Call the function underneath the decorator via __wrapped__, skipping the
# caching layer so the embeddings are requested from the API again.
embeddings2 = fr2ex.embedding.embed_many.__wrapped__(TEXTS)

INFO:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=451 request_id=8871eed37edfc878d8ee2239873407ce response_code=200


In [14]:
# Dimensions of the freshly fetched embeddings (one row per entry in TEXTS).
embeddings2.shape

(4, 1536)

In [15]:
# Elementwise difference between the fresh uncached result and the result
# obtained earlier through the cached entry point.
embeddings2 - embeddings

array([[ 1.98085792e-04, -2.05242424e-04, -4.22066078e-04, ...,
         5.24587929e-04,  2.92179175e-04,  9.69542190e-04],
       [ 7.07395375e-05,  9.71162226e-05, -2.33948231e-05, ...,
         3.66698951e-05, -1.03009865e-04,  6.16442412e-05],
       [ 1.12317502e-06,  5.85597008e-05, -8.25151801e-06, ...,
        -1.78534538e-05,  8.04308802e-05, -6.91041350e-06],
       [-2.00229697e-05,  4.67174686e-06, -3.76319513e-05, ...,
         1.15558505e-05,  2.03077216e-05,  1.14124268e-05]])

In [16]:
# Is the fresh uncached result exactly equal to the earlier result? array_equal
# returns False on a shape mismatch rather than failing.
np.array_equal(embeddings2, embeddings)

False

In [19]:
# Is the fresh uncached result exactly equal to the older saved file?
# array_equal returns False on a shape mismatch rather than failing.
np.array_equal(embeddings2, loaded1)

False