# Tiny Monte Carlo experiment on embedding similarity

SPDX-License-Identifier: 0BSD

This shows how embeddings of short English phrases are often much more similar
to one another than randomly (uniformly) picked normalized vectors are. I think
this makes intuitive sense and isn't specific to OpenAI's models, but it's an
interesting effect nonetheless.

In [1]:
from math import sqrt

import numpy as np

In [2]:
def make_vector(dimension):
    """Return a random unit vector drawn uniformly from the unit sphere.

    Each component is sampled independently from a standard normal
    distribution and the vector is then scaled to unit Euclidean length.
    The joint Gaussian density depends only on the vector's norm, so the
    resulting direction is uniformly distributed over the sphere.

    Normalizing a sample from the cube [-1, 1]^dimension instead would
    over-represent directions pointing toward the cube's corners.

    dimension -- number of components in the returned vector.

    Returns a 1-D float array of shape (dimension,) with Euclidean norm 1.
    """
    vector = np.random.normal(size=(dimension,))
    # Scale in place so the freshly allocated sample is reused.
    vector /= np.linalg.norm(vector)
    return vector

In [3]:
def compute_test_similarities(*, dimension, count):
    """Sample absolute dot products of pairs of random unit vectors.

    For each of ``count`` trials, two vectors are obtained from
    ``make_vector(dimension)`` and their dot product is taken.

    Returns a float32 array of length ``count`` holding the absolute value
    of each dot product.
    """
    def dot_products():
        for _ in range(count):
            first = make_vector(dimension)
            second = make_vector(dimension)
            yield np.dot(first, second)

    # Passing count lets NumPy allocate the output array once up front.
    raw = np.fromiter(dot_products(), dtype=np.float32, count=count)
    return np.abs(raw)

In [4]:
def test_dimension(dimension, *, count=1_000_000):
    """Run the experiment for one dimension and print summary statistics.

    Prints the dimension, the reference value 1/sqrt(dimension), and the
    mean and variance of ``count`` sampled absolute similarities.
    """
    sims = compute_test_similarities(count=count, dimension=dimension)
    # The local must stay named `sims`: the `=` specifier echoes the
    # expression text into the printed output.
    report = f'{dimension=}, {1/sqrt(dimension)=}, {sims.mean()=}, {sims.var()=}'
    print(report)

In [5]:
# Baseline run: 300-dimensional vectors with the default sample count.
test_dimension(300)

dimension=300, 1/sqrt(dimension)=0.05773502691896257, sims.mean()=0.0461321, sims.var()=0.001211917


In [6]:
# Same experiment at 1536 dimensions with the default sample count.
# NOTE(review): 1536 presumably matches the size of the OpenAI embeddings
# mentioned in the introduction - confirm.
test_dimension(1536)

dimension=1536, 1/sqrt(dimension)=0.025515518153991442, sims.mean()=0.020379607, sims.var()=0.00023664696
