# text-embedding-ada-002 - via OpenAI Python library

SPDX-License-Identifier: 0BSD

In [1]:
import functools
import itertools

from colorama import Back, Fore, Style
import numpy as np
import openai
import tiktoken

In [2]:
# Point the openai library at a local key file so the key itself never
# appears in the notebook.
openai.api_key_path = '.openai_key'

In [3]:
texts = [
    'That is a happy person',
    'That is a happy dog',
    'That is a very happy person',
    'Today is a sunny day',
]

In [4]:
# Request embeddings for all of the sample sentences in a single API call.
results = openai.Embedding.create(
    input=texts,
    model='text-embedding-ada-002',
)

In [5]:
results.data[0].keys()

dict_keys(['object', 'index', 'embedding'])

In [6]:
[datum.object for datum in results.data]

['embedding', 'embedding', 'embedding', 'embedding']

In [7]:
[datum.index for datum in results.data]

[0, 1, 2, 3]

In [8]:
# Check that the embeddings came back in strictly increasing index order, by
# comparing each index with the one that follows it.
pairs = itertools.pairwise(datum.index for datum in results.data)
all(lhs < rhs for lhs, rhs in pairs)

True

In [9]:
# One float32 vector per entry of results.data, in the order returned.
v1, v2, v3, v4 = [
    np.array(datum.embedding, dtype=np.float32) for datum in results.data
]

In [10]:
[embedding.shape for embedding in (v1, v2, v3, v4)]

[(1536,), (1536,), (1536,), (1536,)]

In [11]:
# Euclidean length of each embedding. When these are (approximately) 1, a plain
# dot product between two embeddings is also their cosine similarity.
[np.linalg.norm(embedding) for embedding in (v1, v2, v3, v4)]

[1.0, 0.99999994, 1.0000001, 1.0]

In [12]:
np.dot(v1, v2)

0.93006015

In [13]:
np.dot(v1, v3)

0.9834069

In [14]:
np.dot(v1, v4)

0.822598

In [15]:
# Read both versions of the text, collapsing each one onto a single line
# (surrounding whitespace stripped, every newline turned into a space).
tow_versions = []
for tow_path in ('the_open_window.txt', 'the_open_window_modified.txt'):
    with open(tow_path, encoding='utf-8') as file:
        tow_versions.append(file.read().strip().replace('\n', ' '))
tow, tow_modified = tow_versions

# Comparing the two embeddings is only meaningful if the texts really differ.
assert tow != tow_modified

# Embed both versions in a single request.
tow_results = openai.Embedding.create(
    model='text-embedding-ada-002',
    input=tow_versions,
)

In [16]:
[datum.object for datum in tow_results.data]

['embedding', 'embedding']

In [17]:
[datum.index for datum in tow_results.data]

[0, 1]

In [18]:
# Same ordering check as for the short sentences: every index must be smaller
# than the one that follows it.
pairs = itertools.pairwise(datum.index for datum in tow_results.data)
all(lhs < rhs for lhs, rhs in pairs)

True

In [19]:
# float32 vectors for the original text (w1) and the modified text (w2).
w1, w2 = [
    np.array(datum.embedding, dtype=np.float32) for datum in tow_results.data
]

In [20]:
(w1 == w2).all()

False

In [21]:
np.linalg.norm(w1 - w2)

0.08120534

In [22]:
np.dot(w1, w2)

0.99670285

In [23]:
results.usage

<OpenAIObject at 0x7f4a9b7b6b70> JSON: {
  "prompt_tokens": 21,
  "total_tokens": 21
}

In [24]:
tow_results.usage

<OpenAIObject at 0x7f4a9b7b4ef0> JSON: {
  "prompt_tokens": 3114,
  "total_tokens": 3114
}

In [25]:
enc = tiktoken.get_encoding('cl100k_base')

In [26]:
enc.encode('That is a very happy person')

[4897, 374, 264, 1633, 6380, 1732]

In [27]:
enc.encode('altogether doctrinaire')

[76777, 3522, 61990, 68976]

In [28]:
# Highlight styles applied to tokens in rotation: bright black text on a light
# green background, then on a light magenta background.
_COLORS = tuple(
    Style.BRIGHT + Fore.BLACK + background
    for background in (Back.LIGHTGREEN_EX, Back.LIGHTMAGENTA_EX)
)

In [29]:
def _index_color(index):
    """Return the highlight style for the token at position ``index``.

    Styles are taken from ``_COLORS`` in rotation, wrapping around once the
    end of the tuple is reached.
    """
    slot = index % len(_COLORS)
    return _COLORS[slot]

In [30]:
def _do_reveal(enc, text):
    """Print ``text`` on one line with its token boundaries made visible.

    ``text`` is encoded with ``enc``, each token is decoded back to a string
    on its own, and every piece is prefixed with the highlight style for its
    position. The line ends with a style reset followed by a newline.

    Tokens are decoded with ``errors='strict'``. NOTE(review): presumably this
    raises instead of substituting replacement characters when a single token
    is not valid UTF-8 by itself -- confirm against the tiktoken docs.
    """
    highlighted = [
        _index_color(position) + enc.decode([token], errors='strict')
        for position, token in enumerate(enc.encode(text))
    ]
    print(''.join(highlighted), end=(Style.RESET_ALL + '\n'))

In [31]:
def make_token_revealer(encoding_name):
    """Build a one-argument token revealer for the named tiktoken encoding.

    The encoding is looked up once, here. The returned callable takes a text
    and passes it, together with that encoding, to ``_do_reveal``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    revealer = functools.partial(_do_reveal, encoding)
    return revealer

In [32]:
# One revealer per encoding being compared: cl100k_base and gpt2.
reveal_cl100k_base = make_token_revealer('cl100k_base')
reveal_gpt2 = make_token_revealer('gpt2')

In [33]:
def reveal_both(text):
    """Print the tokenization of ``text`` twice: cl100k_base first, then gpt2."""
    for reveal in (reveal_cl100k_base, reveal_gpt2):
        reveal(text)

In [34]:
reveal_both('altogether doctrinaire')

[1m[30m[102malto[1m[30m[105mgether[1m[30m[102m doctr[1m[30m[105minaire[0m
[1m[30m[102malt[1m[30m[105mogether[1m[30m[102m doct[1m[30m[105mrina[1m[30m[102mire[0m


In [35]:
reveal_both('Dr. Von Squilldebrandt')

[1m[30m[102mDr[1m[30m[105m.[1m[30m[102m Von[1m[30m[105m Squ[1m[30m[102mill[1m[30m[105mde[1m[30m[102mbrand[1m[30m[105mt[0m
[1m[30m[102mDr[1m[30m[105m.[1m[30m[102m Von[1m[30m[105m Squ[1m[30m[102mill[1m[30m[105mde[1m[30m[102mbrand[1m[30m[105mt[0m


In [36]:
reveal_both('eberschrifvt')

[1m[30m[102me[1m[30m[105mbers[1m[30m[102mchr[1m[30m[105mif[1m[30m[102mvt[0m
[1m[30m[102me[1m[30m[105mbers[1m[30m[102mch[1m[30m[105mr[1m[30m[102mif[1m[30m[105mvt[0m


In [37]:
reveal_both('wavelet bah')

[1m[30m[102mwave[1m[30m[105mlet[1m[30m[102m bah[0m
[1m[30m[102mwave[1m[30m[105mlet[1m[30m[102m b[1m[30m[105mah[0m


In [38]:
reveal_both('Fourscore and seven years ago')

[1m[30m[102mF[1m[30m[105mours[1m[30m[102mcore[1m[30m[105m and[1m[30m[102m seven[1m[30m[105m years[1m[30m[102m ago[0m
[1m[30m[102mFour[1m[30m[105mscore[1m[30m[102m and[1m[30m[105m seven[1m[30m[102m years[1m[30m[105m ago[0m


In [39]:
reveal_both('Our fiendish foes followed in force, onagers a-ready.')

[1m[30m[102mOur[1m[30m[105m fi[1m[30m[102mend[1m[30m[105mish[1m[30m[102m foes[1m[30m[105m followed[1m[30m[102m in[1m[30m[105m force[1m[30m[102m,[1m[30m[105m on[1m[30m[102magers[1m[30m[105m a[1m[30m[102m-ready[1m[30m[105m.[0m
[1m[30m[102mOur[1m[30m[105m fi[1m[30m[102mendish[1m[30m[105m foes[1m[30m[102m followed[1m[30m[105m in[1m[30m[102m force[1m[30m[105m,[1m[30m[102m on[1m[30m[105magers[1m[30m[102m a[1m[30m[105m-[1m[30m[102mready[1m[30m[105m.[0m


In [40]:
reveal_both('"Squee!" said one mouse. "Squick!" said the other.')

[1m[30m[102m"S[1m[30m[105mquee[1m[30m[102m!"[1m[30m[105m said[1m[30m[102m one[1m[30m[105m mouse[1m[30m[102m.[1m[30m[105m "[1m[30m[102mS[1m[30m[105mquick[1m[30m[102m!"[1m[30m[105m said[1m[30m[102m the[1m[30m[105m other[1m[30m[102m.[0m
[1m[30m[102m"[1m[30m[105mSqu[1m[30m[102mee[1m[30m[105m!"[1m[30m[102m said[1m[30m[105m one[1m[30m[102m mouse[1m[30m[105m.[1m[30m[102m "[1m[30m[105mSqu[1m[30m[102mick[1m[30m[105m!"[1m[30m[102m said[1m[30m[105m the[1m[30m[102m other[1m[30m[105m.[0m


In [41]:
reveal_both('The butler subtly buttled with a glut of scuttlebutt cutlery.')

[1m[30m[102mThe[1m[30m[105m but[1m[30m[102mler[1m[30m[105m subtly[1m[30m[102m butt[1m[30m[105mled[1m[30m[102m with[1m[30m[105m a[1m[30m[102m glut[1m[30m[105m of[1m[30m[102m sc[1m[30m[105muttle[1m[30m[102mbutt[1m[30m[105m cut[1m[30m[102ml[1m[30m[105mery[1m[30m[102m.[0m
[1m[30m[102mThe[1m[30m[105m but[1m[30m[102mler[1m[30m[105m subtly[1m[30m[102m butt[1m[30m[105mled[1m[30m[102m with[1m[30m[105m a[1m[30m[102m glut[1m[30m[105m of[1m[30m[102m sc[1m[30m[105mut[1m[30m[102mtle[1m[30m[105mbutt[1m[30m[102m cut[1m[30m[105mler[1m[30m[102my[1m[30m[105m.[0m


In [42]:
reveal_both('Patients were assessed electroencephalographically.')

[1m[30m[102mPatients[1m[30m[105m were[1m[30m[102m assessed[1m[30m[105m electro[1m[30m[102mence[1m[30m[105mph[1m[30m[102mal[1m[30m[105mographically[1m[30m[102m.[0m
[1m[30m[102mPat[1m[30m[105mients[1m[30m[102m were[1m[30m[105m assessed[1m[30m[102m electro[1m[30m[105mence[1m[30m[102mphal[1m[30m[105mographically[1m[30m[102m.[0m


In [43]:
reveal_both('"Isobaric" means "having equal pressure."')

[1m[30m[102m"[1m[30m[105mIs[1m[30m[102mobar[1m[30m[105mic[1m[30m[102m"[1m[30m[105m means[1m[30m[102m "[1m[30m[105mhaving[1m[30m[102m equal[1m[30m[105m pressure[1m[30m[102m."[0m
[1m[30m[102m"[1m[30m[105mI[1m[30m[102mso[1m[30m[105mbar[1m[30m[102mic[1m[30m[105m"[1m[30m[102m means[1m[30m[105m "[1m[30m[102mhaving[1m[30m[105m equal[1m[30m[102m pressure[1m[30m[105m."[0m


In [44]:
reveal_both("I'm afraid we've sinned rather unforgivably, old chap.")

[1m[30m[102mI[1m[30m[105m'm[1m[30m[102m afraid[1m[30m[105m we[1m[30m[102m've[1m[30m[105m s[1m[30m[102minned[1m[30m[105m rather[1m[30m[102m unf[1m[30m[105morg[1m[30m[102miv[1m[30m[105mably[1m[30m[102m,[1m[30m[105m old[1m[30m[102m chap[1m[30m[105m.[0m
[1m[30m[102mI[1m[30m[105m'm[1m[30m[102m afraid[1m[30m[105m we[1m[30m[102m've[1m[30m[105m sin[1m[30m[102mned[1m[30m[105m rather[1m[30m[102m unfor[1m[30m[105mg[1m[30m[102mivably[1m[30m[105m,[1m[30m[102m old[1m[30m[105m chap[1m[30m[102m.[0m


In [45]:
reveal_both('In the Shakespearean tragedy *Othello*, Brabantio was the father of Desdemona.')

[1m[30m[102mIn[1m[30m[105m the[1m[30m[102m Shakespeare[1m[30m[105man[1m[30m[102m tragedy[1m[30m[105m *[1m[30m[102mO[1m[30m[105mth[1m[30m[102mello[1m[30m[105m*,[1m[30m[102m Br[1m[30m[105mabant[1m[30m[102mio[1m[30m[105m was[1m[30m[102m the[1m[30m[105m father[1m[30m[102m of[1m[30m[105m Des[1m[30m[102mdem[1m[30m[105mona[1m[30m[102m.[0m
[1m[30m[102mIn[1m[30m[105m the[1m[30m[102m Shakespeare[1m[30m[105man[1m[30m[102m tragedy[1m[30m[105m *[1m[30m[102mO[1m[30m[105mthe[1m[30m[102mllo[1m[30m[105m*,[1m[30m[102m Br[1m[30m[105mab[1m[30m[102mant[1m[30m[105mio[1m[30m[102m was[1m[30m[105m the[1m[30m[102m father[1m[30m[105m of[1m[30m[102m Des[1m[30m[105mdem[1m[30m[102mona[1m[30m[105m.[0m


In [46]:
reveal_both('The Slopes Bureau frowns on whalebone skis, for reasons of conservation.')

[1m[30m[102mThe[1m[30m[105m Slo[1m[30m[102mpes[1m[30m[105m Bureau[1m[30m[102m f[1m[30m[105mrow[1m[30m[102mns[1m[30m[105m on[1m[30m[102m whale[1m[30m[105mbone[1m[30m[102m sk[1m[30m[105mis[1m[30m[102m,[1m[30m[105m for[1m[30m[102m reasons[1m[30m[105m of[1m[30m[102m conservation[1m[30m[105m.[0m
[1m[30m[102mThe[1m[30m[105m Sl[1m[30m[102mopes[1m[30m[105m Bureau[1m[30m[102m frown[1m[30m[105ms[1m[30m[102m on[1m[30m[105m whale[1m[30m[102mbone[1m[30m[105m sk[1m[30m[102mis[1m[30m[105m,[1m[30m[102m for[1m[30m[105m reasons[1m[30m[102m of[1m[30m[105m conservation[1m[30m[102m.[0m


In [47]:
reveal_both("The ski instructor upbraided the novice for attempting a black diamond, which was beyond the student's capabilities and also triggered an unfortunate avalanche.")

[1m[30m[102mThe[1m[30m[105m ski[1m[30m[102m instructor[1m[30m[105m up[1m[30m[102mbra[1m[30m[105mided[1m[30m[102m the[1m[30m[105m novice[1m[30m[102m for[1m[30m[105m attempting[1m[30m[102m a[1m[30m[105m black[1m[30m[102m diamond[1m[30m[105m,[1m[30m[102m which[1m[30m[105m was[1m[30m[102m beyond[1m[30m[105m the[1m[30m[102m student[1m[30m[105m's[1m[30m[102m capabilities[1m[30m[105m and[1m[30m[102m also[1m[30m[105m triggered[1m[30m[102m an[1m[30m[105m unfortunate[1m[30m[102m avalanche[1m[30m[105m.[0m
[1m[30m[102mThe[1m[30m[105m ski[1m[30m[102m instructor[1m[30m[105m up[1m[30m[102mbra[1m[30m[105mided[1m[30m[102m the[1m[30m[105m novice[1m[30m[102m for[1m[30m[105m attempting[1m[30m[102m a[1m[30m[105m black[1m[30m[102m diamond[1m[30m[105m,[1m[30m[102m which[1m[30m[105m was[1m[30m[102m beyond[1m[30m[105m the[1m[30m[102m student[1m[30m[105m's[1m[30m[10

In [48]:
reveal_both('Alpha Centauri is the nearest star to our sun.')

[1m[30m[102mAlpha[1m[30m[105m Cent[1m[30m[102maur[1m[30m[105mi[1m[30m[102m is[1m[30m[105m the[1m[30m[102m nearest[1m[30m[105m star[1m[30m[102m to[1m[30m[105m our[1m[30m[102m sun[1m[30m[105m.[0m
[1m[30m[102mAlpha[1m[30m[105m Centauri[1m[30m[102m is[1m[30m[105m the[1m[30m[102m nearest[1m[30m[105m star[1m[30m[102m to[1m[30m[105m our[1m[30m[102m sun[1m[30m[105m.[0m


In [49]:
reveal_both('Lana left the sonic harmonica on the Encyclopaedia Daemonica.')

[1m[30m[102mL[1m[30m[105mana[1m[30m[102m left[1m[30m[105m the[1m[30m[102m sonic[1m[30m[105m harmon[1m[30m[102mica[1m[30m[105m on[1m[30m[102m the[1m[30m[105m Enc[1m[30m[102myc[1m[30m[105mlo[1m[30m[102mpa[1m[30m[105media[1m[30m[102m Daemon[1m[30m[105mica[1m[30m[102m.[0m
[1m[30m[102mL[1m[30m[105mana[1m[30m[102m left[1m[30m[105m the[1m[30m[102m sonic[1m[30m[105m harmon[1m[30m[102mica[1m[30m[105m on[1m[30m[102m the[1m[30m[105m En[1m[30m[102mcyclop[1m[30m[105ma[1m[30m[102media[1m[30m[105m Da[1m[30m[102memon[1m[30m[105mica[1m[30m[102m.[0m


In [50]:
reveal_both('dtunqhoepgfxvzcwlsajkmbiry')

[1m[30m[102mdt[1m[30m[105mun[1m[30m[102mq[1m[30m[105mho[1m[30m[102mep[1m[30m[105mgfx[1m[30m[102mv[1m[30m[105mzc[1m[30m[102mw[1m[30m[105mls[1m[30m[102maj[1m[30m[105mk[1m[30m[102mmb[1m[30m[105miry[0m
[1m[30m[102md[1m[30m[105mtun[1m[30m[102mq[1m[30m[105mho[1m[30m[102mep[1m[30m[105mg[1m[30m[102mfx[1m[30m[105mv[1m[30m[102mz[1m[30m[105mc[1m[30m[102mw[1m[30m[105mls[1m[30m[102maj[1m[30m[105mk[1m[30m[102mmb[1m[30m[105miry[0m


In [51]:
def count_tokens(text, encoding=None):
    """Return the number of tokens that ``text`` encodes to.

    Parameters:
        text: The string to tokenize.
        encoding: Optional encoder exposing an ``encode(text)`` method that
            returns a sequence of tokens. When omitted, the notebook-level
            ``enc`` (the cl100k_base encoding) is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        The length of the token sequence produced by the encoder.
    """
    if encoding is None:
        # Resolved at call time so the default tracks the notebook's ``enc``.
        encoding = enc
    return len(encoding.encode(text))

In [52]:
texts

['That is a happy person',
 'That is a happy dog',
 'That is a very happy person',
 'Today is a sunny day']

In [53]:
sum(count_tokens(text) for text in texts)

21

In [54]:
count_tokens(tow) + count_tokens(tow_modified)

3114

In [55]:
reveal_cl100k_base(tow)



In [56]:
reveal_gpt2(tow)



In [57]:
# Read the original text again, this time leaving its newlines untouched so it
# can still be split into paragraphs on blank lines.
with open('the_open_window.txt', encoding='utf-8') as file:
    tow_nl = file.read()

In [58]:
# Split the text on blank lines and flatten each paragraph onto one line.
paragraphs = (
    ' '.join(raw_graf.split('\n'))
    for raw_graf in tow_nl.split('\n\n')
)

# For each paragraph, show the cl100k_base tokenization and then the gpt2 one,
# each preceded by an empty line, with one more empty line after the pair.
for graf in paragraphs:
    for reveal in (reveal_cl100k_base, reveal_gpt2):
        print()
        reveal(graf)
    print()


[1m[30m[102m"My[1m[30m[105m aunt[1m[30m[102m will[1m[30m[105m be[1m[30m[102m down[1m[30m[105m presently[1m[30m[102m,[1m[30m[105m Mr[1m[30m[102m.[1m[30m[105m Nut[1m[30m[102mtel[1m[30m[105m,"[1m[30m[102m said[1m[30m[105m a[1m[30m[102m very[1m[30m[105m self[1m[30m[102m-[1m[30m[105mposs[1m[30m[102messed[1m[30m[105m young[1m[30m[102m lady[1m[30m[105m of[1m[30m[102m fifteen[1m[30m[105m;[1m[30m[102m "[1m[30m[105min[1m[30m[102m the[1m[30m[105m meantime[1m[30m[102m you[1m[30m[105m must[1m[30m[102m try[1m[30m[105m and[1m[30m[102m put[1m[30m[105m up[1m[30m[102m with[1m[30m[105m me[1m[30m[102m."[0m

[1m[30m[102m"[1m[30m[105mMy[1m[30m[102m aunt[1m[30m[105m will[1m[30m[102m be[1m[30m[105m down[1m[30m[102m presently[1m[30m[105m,[1m[30m[102m Mr[1m[30m[105m.[1m[30m[102m Nut[1m[30m[105mtel[1m[30m[102m,"[1m[30m[105m said[1m[30m[102m a[1m[30m[105m