In [90]:
import codecs
import itertools

from colorama import Back, Fore, Style
import numpy as np
import openai
import tiktoken

In [2]:
# Have the openai client read the API key from a local file instead of
# embedding the secret in the notebook.
openai.api_key_path = '.openai_key'

In [3]:
# Sample sentences: three close variants of one sentence plus an unrelated
# one, so the similarity of their embeddings can be compared further down.
texts = [
    'That is a happy person',
    'That is a happy dog',
    'That is a very happy person',
    'Today is a sunny day',
]

In [4]:
# Embed all of the sample sentences in a single request.
results = openai.Embedding.create(
    input=texts,
    model='text-embedding-ada-002',
)

In [13]:
# Inspect which fields a returned datum carries.
results.data[0].keys()

dict_keys(['object', 'index', 'embedding'])

In [14]:
# Look at the object type reported for each datum.
[datum.object for datum in results.data]

['embedding', 'embedding', 'embedding', 'embedding']

In [27]:
# Look at the index reported for each datum.
[datum.index for datum in results.data]

[0, 1, 2, 3]

In [16]:
# Check that the data arrive in strictly increasing index order, so that
# unpacking them positionally lines up with `texts` (assuming index is the
# position of the corresponding input -- TODO confirm against the API docs).
# Note: itertools.pairwise requires Python 3.10+.
pairs = itertools.pairwise(datum.index for datum in results.data)
all(lhs < rhs for lhs, rhs in pairs)

True

In [19]:
# One float32 vector per sentence. The unpacking itself fails unless exactly
# four embeddings came back.
v1, v2, v3, v4 = (
    np.array(datum.embedding, dtype=np.float32)
    for datum in results.data
)

In [20]:
# Show the shape of each sentence vector.
[embedding.shape for embedding in (v1, v2, v3, v4)]

[(1536,), (1536,), (1536,), (1536,)]

In [21]:
# Euclidean length of each sentence vector.
[np.linalg.norm(embedding) for embedding in (v1, v2, v3, v4)]

[0.99999994, 1.0, 0.9999999, 0.9999999]

In [22]:
# The norms computed earlier are all ~1, so this dot product is effectively the
# cosine similarity. Here: 'happy person' vs. 'happy dog'.
np.dot(v1, v2)

0.9301601

In [23]:
# 'happy person' vs. 'very happy person'.
np.dot(v1, v3)

0.98347116

In [24]:
# 'happy person' vs. the unrelated 'sunny day' sentence.
np.dot(v1, v4)

0.8226278

In [25]:
def read_flattened(path):
    """Return the contents of the UTF-8 text file at path as a single line.

    Leading and trailing whitespace is stripped and every newline is replaced
    by a space.
    """
    with open(path, encoding='utf-8') as file:
        return file.read().strip().replace('\n', ' ')


# Both versions of the story go through the same flattening helper, so they
# cannot drift apart in how they are prepared.
tow = read_flattened('the_open_window.txt')
tow_modified = read_flattened('the_open_window_modified.txt')

# Comparing the embeddings is only meaningful if the two texts really differ.
assert tow != tow_modified

# Embed both versions of the story in a single request.
tow_results = openai.Embedding.create(
    input=[tow, tow_modified],
    model='text-embedding-ada-002',
)

In [26]:
# Look at the object type reported for each story datum.
[datum.object for datum in tow_results.data]

['embedding', 'embedding']

In [28]:
# Look at the index reported for each story datum.
[datum.index for datum in tow_results.data]

[0, 1]

In [29]:
# Check that the story embeddings arrive in strictly increasing index order
# before they are unpacked positionally.
# Note: itertools.pairwise requires Python 3.10+.
pairs = itertools.pairwise(datum.index for datum in tow_results.data)
all(lhs < rhs for lhs, rhs in pairs)

True

In [30]:
# One float32 vector per story version. The unpacking fails unless exactly two
# embeddings came back.
w1, w2 = (
    np.array(datum.embedding, dtype=np.float32)
    for datum in tow_results.data
)

In [31]:
# Are the two story embeddings element-wise identical?
(w1 == w2).all()

False

In [32]:
# Euclidean distance between the two story embeddings.
np.linalg.norm(w1 - w2)

0.08089018

In [33]:
# Dot product of the two story embeddings. This equals their cosine similarity
# only if they are unit-length like the sentence embeddings, which is not
# checked here.
np.dot(w1, w2)

0.9967284

In [38]:
# Token usage reported by the API for the sample-sentence request.
results.usage

<OpenAIObject at 0x22ae654dc10> JSON: {
  "prompt_tokens": 21,
  "total_tokens": 21
}

In [36]:
# Token usage reported by the API for the two-story request.
tow_results.usage

<OpenAIObject at 0x22ae6ad2c90> JSON: {
  "prompt_tokens": 3114,
  "total_tokens": 3114
}

In [47]:
# Load a tokenizer so texts can be tokenized locally. The local token counts
# computed with it further down match the usage figures the API reported.
enc = tiktoken.get_encoding('cl100k_base')

In [48]:
# Token ids for one of the sample sentences.
enc.encode('That is a very happy person')

[4897, 374, 264, 1633, 6380, 1732]

In [55]:
# Words can split into several tokens: two words here yield four token ids.
enc.encode('altogether doctrinaire')

[76777, 3522, 61990, 68976]

In [95]:
# Highlight styles that alternate between adjacent tokens: bright black text
# on a light green or a light magenta background.
COLORS = tuple(
    Style.BRIGHT + Fore.BLACK + background
    for background in (Back.LIGHTGREEN_EX, Back.LIGHTMAGENTA_EX)
)

In [96]:
def index_color(index):
    """Return the style for position index, cycling through COLORS."""
    palette_size = len(COLORS)
    return COLORS[index % palette_size]

In [97]:
def reveal(text):
    """Print text with its tokens highlighted in alternating colors.

    The text is encoded with the module-level ``enc``. Token number i is
    printed in the style ``index_color(i)``, and the styling is reset before
    the final newline.

    Token bytes are decoded incrementally rather than one token at a time in
    isolation, because a single token may hold only part of a multi-byte UTF-8
    character, and decoding such a token on its own raises UnicodeDecodeError.
    A character that spans several tokens is shown in the color of the token
    that completes it; the earlier tokens of that character contribute only
    their (invisible) color codes.

    Raises:
        UnicodeDecodeError: if the token bytes are not valid UTF-8 overall.
    """
    decoder = codecs.getincrementaldecoder('utf-8')(errors='strict')
    pretty = []
    for index, code in enumerate(enc.encode(text)):
        # Yields '' while a multi-byte character is still incomplete.
        piece = decoder.decode(enc.decode_single_token_bytes(code))
        pretty.append(index_color(index) + piece)
    # Flush the decoder: fails loudly if the last token left a character
    # unfinished.
    decoder.decode(b'', final=True)
    # Everything is built before printing, so an error leaves no partial output.
    print(*pretty, sep='', end=(Style.RESET_ALL + '\n'))

In [98]:
# Show where the token boundaries fall inside the two words encoded earlier.
reveal('altogether doctrinaire')

[1m[30m[102malto[1m[30m[105mgether[1m[30m[102m doctr[1m[30m[105minaire[0m


In [99]:
# Show how a title and surname are split into tokens.
reveal('Dr. Von Squilldebrandt')

[1m[30m[102mDr[1m[30m[105m.[1m[30m[102m Von[1m[30m[105m Squ[1m[30m[102mill[1m[30m[105mde[1m[30m[102mbrand[1m[30m[105mt[0m


In [100]:
def count_tokens(text):
    """Return the number of tokens that ``enc`` produces for text."""
    token_ids = enc.encode(text)
    return len(token_ids)

In [101]:
# Recall the sample sentences before counting their tokens.
texts

['That is a happy person',
 'That is a happy dog',
 'That is a very happy person',
 'Today is a sunny day']

In [102]:
# Local token total for the sample sentences; compare with results.usage.
sum(count_tokens(text) for text in texts)

21

In [103]:
# Local token total for the two story versions; compare with tow_results.usage.
count_tokens(tow) + count_tokens(tow_modified)

3114

In [104]:
# Highlight the tokens of the flattened (single-line) story text.
reveal(tow)



In [105]:
# Same story, but read fresh from disk so the original line breaks are kept
# (`tow` had its newlines replaced by spaces).
with open('the_open_window.txt', encoding='utf-8') as file:
    reveal(file.read())

[1m[30m[102m"My[1m[30m[105m aunt[1m[30m[102m will[1m[30m[105m be[1m[30m[102m down[1m[30m[105m presently[1m[30m[102m,[1m[30m[105m Mr[1m[30m[102m.[1m[30m[105m Nut[1m[30m[102mtel[1m[30m[105m,"[1m[30m[102m said[1m[30m[105m a[1m[30m[102m very[1m[30m[105m
[1m[30m[102mself[1m[30m[105m-[1m[30m[102mposs[1m[30m[105messed[1m[30m[102m young[1m[30m[105m lady[1m[30m[102m of[1m[30m[105m fifteen[1m[30m[102m;[1m[30m[105m "[1m[30m[102min[1m[30m[105m the[1m[30m[102m meantime[1m[30m[105m you[1m[30m[102m must[1m[30m[105m try[1m[30m[102m
[1m[30m[105mand[1m[30m[102m put[1m[30m[105m up[1m[30m[102m with[1m[30m[105m me[1m[30m[102m."

[1m[30m[105mF[1m[30m[102mram[1m[30m[105mton[1m[30m[102m Nut[1m[30m[105mtel[1m[30m[102m ende[1m[30m[105mavored[1m[30m[102m to[1m[30m[105m say[1m[30m[102m the[1m[30m[105m correct[1m[30m[102m something[1m[30m[105m which[1m[30m[1