In [3]:
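# Packages used by the code below (uncomment to install them into the kernel's environment).
# ipywidgets is needed for the interactive cells and ipympl for `%matplotlib widget`.
# %pip install fastapi pydantic uvicorn
# %pip install numpy pandas matplotlib seaborn scikit-learn sentence_transformers torch nltk annoy tqdm ipython jupyter ipywidgets ipympl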

Collecting fastapi
  Using cached fastapi-0.110.3-py3-none-any.whl.metadata (24 kB)
Collecting pydantic
  Using cached pydantic-2.7.1-py3-none-any.whl.metadata (107 kB)
Collecting uvicorn
  Downloading uvicorn-0.29.0-py3-none-any.whl.metadata (6.3 kB)
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Using cached starlette-0.37.2-py3-none-any.whl.metadata (5.9 kB)
Collecting annotated-types>=0.4.0 (from pydantic)
  Using cached annotated_types-0.6.0-py3-none-any.whl.metadata (12 kB)
Collecting pydantic-core==2.18.2 (from pydantic)
  Using cached pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.5 kB)
Using cached fastapi-0.110.3-py3-none-any.whl (91 kB)
Using cached pydantic-2.7.1-py3-none-any.whl (409 kB)
Using cached pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl (1.8 MB)
Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m458.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01

In [1]:
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import torch

# Prefer Apple's Metal backend (the notebook's original target), but fall back
# to CUDA and then the CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")



In [2]:
# Sentence-embedding model used by every cell below, placed on `device`.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

In [3]:
from tqdm.notebook import tqdm

In [4]:
import nltk

In [5]:
# Make sure the Brown corpus is available locally; it is the source of the vocabulary.
nltk.download('brown')


[nltk_data] Downloading package brown to /Users/igor/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [6]:
from nltk.corpus import wordnet as wn

In [7]:
from nltk.corpus import brown
import re
# Matches one character in a-z, case-insensitively. Used with .match() below to
# keep only corpus tokens that start with a letter.
letters = re.compile(r'[a-z]', re.I)


# Runs of non-letters anchored at either end of a (lower-cased) token.
_NON_LETTER_EDGES = re.compile(r'^[^a-z]+|[^a-z]+$')


def clear(s):
    """Normalise a raw token to a bare lower-case word.

    The token is lower-cased, a trailing possessive ``'s`` is removed and any
    non-letter characters are trimmed from both ends.

    Parameters:
    - s (str): The raw token.

    Returns:
    - str: The cleaned token (may be empty if nothing but punctuation/digits
           was left).
    """
    s = s.lower()
    # Two passes: the possessive can be hidden behind trailing punctuation
    # ("dog's," -> "dog's" -> "dog"), so it has to be checked again after the
    # edges have been trimmed.
    for _ in range(2):
        if s.endswith("'s"):
            s = s[:-2]
        s = _NON_LETTER_EDGES.sub('', s)
    return s


# Frequency table of cleaned Brown-corpus tokens; tokens that do not start with
# a letter are dropped before cleaning.
freqs = nltk.FreqDist(
    map(clear, filter(letters.match, brown.words()))
)

In [8]:
def vectorify(*strings: str):
    """Embed the given strings and L2-normalise each embedding.

    Parameters:
    - *strings (str): The texts to embed.

    Returns:
    - list[np.ndarray]: One unit-length vector per input string, in input
                        order. An empty list when called without arguments.
    """
    if not strings:
        # Nothing to encode; skip the model call entirely.
        return []

    embeddings = model.encode(list(strings), convert_to_tensor=True)
    # A single normalisation is enough; dividing by the norm a second time only
    # repeats the same operation on vectors that already have unit length.
    embeddings = F.normalize(embeddings, p=2, dim=1)

    return [emb.cpu().numpy() for emb in embeddings]

# Sanity check: each embedding is a flat vector (768 values in the recorded output).
vectorify("hello")[0].shape

(768,)

In [9]:
# Vocabulary: every distinct cleaned token, in the frequency table's order.
words = list(freqs)

In [10]:
# Map each vocabulary word to its embedding (all words are encoded in one call).
word2vec = dict(zip(words, vectorify(*words)))

In [31]:
# Persist the word -> embedding mapping so later sessions can skip re-encoding.
# NOTE(review): this writes "word2vec2.pkl" while the load cell below reads
# "word2vec.pkl" — confirm which file is the intended cache.
import pickle
with open("word2vec2.pkl", "wb") as f:
    pickle.dump(word2vec, f)

In [18]:
# Number of distinct cleaned tokens in the frequency table.
len(freqs)

45457

In [10]:
# Load a previously saved word -> embedding mapping instead of recomputing it.
# NOTE(review): the save cell above writes "word2vec2.pkl" but this reads
# "word2vec.pkl" — confirm the two files hold the same vocabulary, otherwise the
# index built or loaded below will not line up with these vectors.
# SECURITY: pickle.load can execute arbitrary code; only load files you created.
import pickle
with open("word2vec.pkl", "rb") as f:
    word2vec = pickle.load(f)

In [11]:
from annoy import AnnoyIndex
import numpy as np

# Dimension of each vector, read from an arbitrary stored embedding so this cell
# does not depend on one particular word ('pig') being in the vocabulary.
f = next(iter(word2vec.values())).shape[0]
n_trees = 50  # More trees, more precision, more memory and build time

# Initialize Annoy Index using the angular metric
t = AnnoyIndex(f, 'angular')

In [12]:
# Register every embedding under its position in word2vec's iteration order;
# the word <-> index lookup tables built later rely on that same order.
for i, emb in enumerate(tqdm(word2vec.values())):
    t.add_item(i, emb)

  0%|          | 0/45457 [00:00<?, ?it/s]

In [13]:
# Build the index with n_trees trees from the items added above.
t.build(n_trees)

# Write the built index to disk so it can be reloaded without rebuilding.
t.save('word2vec2.ann')

True

In [12]:
# Load a previously saved index from disk.
# NOTE(review): the build cell saves 'word2vec2.ann' but this loads
# 'word2vec.ann' — confirm this file was built from the same word2vec mapping,
# since item ids are translated back to words by position.
t.load('word2vec.ann')

True

In [14]:
# Lookup tables between a word and its item id in the Annoy index
# (ids follow word2vec's iteration order).
index2word = dict(enumerate(word2vec))
word2index = {word: i for i, word in index2word.items()}

In [15]:
from functools import lru_cache

def get_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors."""
    difference = vec1 - vec2
    return np.linalg.norm(difference)

@lru_cache(maxsize=256)
def get_coordinates(word):
    """Return the embedding for `word`.

    Words present in `word2vec` are served from that table; anything else is
    embedded on demand with `vectorify`. Results are memoised (up to 256
    distinct words).
    """
    if word in word2vec:
        return word2vec[word]
    return vectorify(word)[0]

def get_nearest_neighbors(word, n=10):
    """Return the `n` indexed words closest to `word`.

    Parameters:
    - word (str | vector): A word (looked up via `get_coordinates`) or an
                           already computed query vector.
    - n (int): Number of neighbours to return.

    Returns:
    - list[tuple[str, float]]: (word, distance) pairs in the order the index
                               returns them.
    """
    query = word if not isinstance(word, str) else get_coordinates(word)
    indices, distances = t.get_nns_by_vector(query, n, include_distances=True)
    return [(index2word[idx], dist) for idx, dist in zip(indices, distances)]


In [16]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from ipywidgets import interact, interactive, fixed

# Embeddings of the two anchor words.
man, woman = map(get_coordinates, ('man', 'woman'))

# get the line between those 2 points

def point_on_line(a, b, t):
    """
    Linearly interpolate between two vectors.

    Parameters:
    - a (np.array): Start of the segment (returned for t=0).
    - b (np.array): End of the segment (returned for t=1).
    - t (float): Interpolation parameter; values between 0 and 1 give points
                 on the segment from a to b.

    Returns:
    - np.array: The point a + t * (b - a).
    """
    direction = b - a
    return a + t * direction

def point_in_triangle(a, b, c, u, v, w):
    """
    Compute a weighted combination of the triangle vertices a, b and c using
    normalized barycentric coordinates.

    Parameters:
    - a, b, c (np.array): The vertices of the triangle.
    - u, v, w (float): Raw weights for a, b and c; they are rescaled so that
                       they sum to 1. If they sum to zero (e.g. all three are
                       0), equal weights are used and the centroid is returned
                       instead of a NaN vector.

    Returns:
    - np.array: The weighted point (inside the triangle when the weights are
                non-negative).
    """
    # Stack the vertices and the coordinates into arrays
    vertices = np.array([a, b, c])
    weights = np.array([u, v, w], dtype=float)

    # Normalize the barycentric coordinates, guarding against a zero sum,
    # which would otherwise divide by zero and produce NaNs.
    total = weights.sum()
    if total == 0:
        weights_normalized = np.full(3, 1.0 / 3.0)
    else:
        weights_normalized = weights / total

    # Compute the point in the triangle
    return np.dot(weights_normalized, vertices)

def f(words, n, masculinity=0.5, femininity=0.5, nonbinary=0.5):
    """Find the words nearest to the query words shifted by a gender offset.

    Parameters:
    - words (str): Whitespace-separated query words.
    - n (int): Number of neighbours to return.
    - masculinity, femininity, nonbinary (float): Relative weights of the
      'man', 'woman' and 'non-binary' anchor vectors.

    Returns:
    - list[tuple[str, float]]: (word, distance) pairs from
                               `get_nearest_neighbors`.
    """
    word_vectors = [get_coordinates(token) for token in words.split()]
    # Plain vector sum of the query words.
    combined = np.sum(word_vectors, axis=0)

    anchors = [get_coordinates(label) for label in ('man', 'woman', 'non-binary')]
    gender_offset = point_in_triangle(*anchors, masculinity, femininity, nonbinary)

    return get_nearest_neighbors(combined + gender_offset, n)

# Interactive explorer: a text box for the query words plus sliders for the
# neighbour count and the three gender weights; f is re-run on every change.
interact(
    f,
    words='love hate',
    n=widgets.IntSlider(min=1, max=100, value=10), 
    masculinity=widgets.FloatSlider(min=0, max=1, value=0.5),
    femininity=widgets.FloatSlider(min=0, max=1, value=0.5),
    nonbinary=widgets.FloatSlider(min=0, max=1, value=0.5),   
)
# Trailing None so the cell does not echo interact's return value.
None

interactive(children=(Text(value='love hate', description='words'), IntSlider(value=10, description='n', min=1…

In [30]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

def project_to_best_plane(points, n_components=2):
    """
    Project points onto their best-fitting low-dimensional subspace using PCA.

    Parameters:
    - points (array-like): Shape (n_samples, n_features), where n_samples is
                           the number of data points and n_features is the
                           dimensionality of each data point.
    - n_components (int): Number of output dimensions (2 for a plane, 3 for a
                          3-D view).

    Returns:
    - np.array: Shape (n_samples, n_components). When there are fewer samples
                (or features) than requested components, the components PCA
                cannot provide are filled with zeros instead of raising.
    """
    points = np.asarray(points)
    if points.size == 0:
        # Nothing to project.
        return np.zeros((0, n_components))

    n_samples, n_features = points.shape
    # PCA cannot extract more components than min(n_samples, n_features).
    usable = min(n_components, n_samples, n_features)

    # Fit PCA on the data and transform the data to the new axes
    projected_points = PCA(usable).fit_transform(points)

    if usable < n_components:
        # Pad so callers can always unpack n_components coordinates per point.
        padding = np.zeros((n_samples, n_components - usable))
        projected_points = np.hstack([projected_points, padding])

    return projected_points


def f(words, n):
    """Plot the nearest neighbours of the query words in a 3-D PCA projection.

    NOTE(review): this reuses the name `f` of the gender-explorer function
    defined in an earlier cell and replaces it once this cell runs.

    Parameters:
    - words (str): Whitespace-separated query words; duplicates are ignored.
    - n (int): Number of neighbours to fetch from the index.

    Returns:
    - None. Draws a 3-D scatter plot in which marker size grows with the log
      of the word's corpus frequency and opacity falls with distance from the
      query vector (never below 0.4).
    """
    wrds = set(words.split())
    if not wrds:
        # A blank query has no vector to search with and nothing to plot.
        return None

    coordinates = [get_coordinates(w) for w in wrds]
    # Despite the name this is the plain sum of the query vectors.
    median = np.sum(coordinates, axis=0)
    neighbours: list[tuple[str, float]] = get_nearest_neighbors(median, n)
    found = {w for w, _ in neighbours}

    def angular_distance(vec):
        # Euclidean distance between the normalised vectors: the same quantity
        # Annoy's 'angular' metric reports, so query words and neighbours are
        # measured on one scale when the opacity is computed.
        return np.linalg.norm(
            vec / np.linalg.norm(vec) - median / np.linalg.norm(median)
        )

    all_words: list[dict] = [
        {'word': w, 'distance': d, 'coordinates': get_coordinates(w)}
        for w, d in neighbours
    ] + [
        {'word': w, 'distance': angular_distance(c), 'coordinates': c}
        for w, c in zip(wrds, coordinates)
        if w not in found
    ]

    projected_points = project_to_best_plane(
        [word['coordinates'] for word in all_words], n_components=3
    )

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    for word, (x, y, z) in zip(all_words, projected_points):
        # Words absent from the frequency table are treated as seen once.
        freq = freqs.get(word['word'], 1)
        size = np.log(freq) * 10 + 1
        alpha = max(0.4, 1 - word['distance'])
        ax.scatter(x, y, z, s=size, alpha=alpha, label=word['word'])
        ax.text(x, y, z, word['word'], fontsize=12, alpha=alpha)
    plt.show()

# Interactive 3-D neighbourhood plot: a text box for the query words and a
# slider for the neighbour count; f is re-run on every change.
interact(
    f,
    words='love joy',
    n=widgets.IntSlider(min=2, max=100, value=10), 
)
# Trailing None so the cell does not echo interact's return value.
None

interactive(children=(Text(value='love joy', description='words'), IntSlider(value=10, description='n', min=2)…

In [25]:
# Use the interactive widget backend for matplotlib figures.
# NOTE(review): this cell appears after the plotting cell but has an earlier
# execution count — presumably it has to run before plotting; consider moving it up.
%matplotlib widget

In [38]:
# Environment check: print the path of the `python` executable the shell resolves.
!which python 

/Users/igor/.pyenv/versions/3.11.6/envs/tmp/bin/python


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
# Environment check: report the operating system name.
import platform
platform.system()

'Darwin'