In [1]:
import sys
#!{sys.executable} -m ********
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from pympler import asizeof
from nltk.corpus import words

# Load list of Webster Dictionary Words

In [2]:
word_list = words.words()
word_list = [i.lower() for i in word_list]
print(len(word_list))  # prints the number of words in the corpus
# Ensure word_list is a set (if not already)
word_list = set(word_list)

236736


# Load Google News Trained Word2Vec Embedding, then Filter to only contain Dictionary Words
model: KeyedVector object of embeddings

model_np: Numpy array of embeddings

norm_model_np: Numpy array of embeddings scaled to unit length

In [3]:
# Replace 'path/to/embeddings.bin' with the actual path to your .bin file.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
words_in_model = [word for word in model.key_to_index if word.lower() in word_list]
#index_in_model = [model.key_to_index[i] for i in words_in_model]
model = model[words_in_model]

# Create a new KeyedVectors instance with the same vector size
new_model = KeyedVectors(vector_size=300)
# Assign the keys (vocabulary) in order
new_model.index_to_key = words_in_model
# Assign the vectors (embedding matrix)
new_model.vectors = model
# Rebuild the key-to-index dictionary
new_model.key_to_index = {word: idx for idx, word in enumerate(words_in_model)}
del(model)
model = new_model
model_np = model.vectors.T
norms = np.linalg.norm(model_np, axis = 0)
norm_model_np = model_np / norms

# To get the vector for a specific word:
vector = model['word']  # e.g., model['king']

# To see the most similar words:
similar_words = model.most_similar('king')
print(similar_words)


[('queen', 0.6510956883430481), ('monarch', 0.6413194537162781), ('prince', 0.6159993410110474), ('sultan', 0.5864824056625366), ('ruler', 0.5797567367553711), ('throne', 0.5422105193138123), ('royal', 0.5239794254302979), ('kingdom', 0.5210405588150024), ('princess', 0.5161998867988586), ('King', 0.5158917903900146)]


# View Size of Embeddings
1 word is 300 $\times$ 1 vector

Numpy array is 301MB

In [4]:
print(model_np.shape, "model_np shape")
print(asizeof.asizeof(model), "model size")
print(asizeof.asizeof(model_np), "model_np size")
print(sys.getsizeof(norm_model_np), 'norm_model_np size')

(300, 125754) model_np shape
166814032 model size
301809872 model_np size
150904928 norm_model_np size


# Random Selection of Words to Test

In [9]:
words = ['loud','rhythm','recovery','tolerate','zero','accountant','club','form','tablet','bomber','row','lost','coin','wonder','crack','snap','reduce','hostage','season','loss','echo','interest','reform','faint','talkative','traction','slump','fine']

In [None]:
# Usage of model:

model[key] -> key = 'word'
model_np[:, model.key_to_index[key]] -> key = 'word'
model_np[:, index] -> model.index_to_key[index] = 'word'

In [10]:
for i in words:
    print(i)
    print(np.linalg.norm(model[i]))
    print(np.linalg.norm(model_np[:,model.key_to_index[i]]))
    print(np.linalg.norm(norm_model_np[:,model.key_to_index[i]]))

loud
2.9796681
2.9796681
0.99999994
rhythm
3.5081346
3.5081346
1.0
recovery
2.9200957
2.9200957
1.0
tolerate
3.219292
3.219292
1.0
zero
2.7152338
2.7152338
1.0
accountant
3.2057168
3.2057168
0.99999994
club
2.5405009
2.5405009
0.99999994
form
2.129578
2.129578
1.0
tablet
3.5970337
3.5970337
1.0
bomber
4.021521
4.021521
1.0
row
2.6076393
2.6076393
0.99999994
lost
2.5961642
2.5961642
1.0
coin
3.2794971
3.2794971
1.0
wonder
2.5662866
2.5662866
1.0
crack
2.9027128
2.9027128
0.99999994
snap
2.795854
2.795854
1.0
reduce
2.8118677
2.8118677
0.99999994
hostage
3.9402144
3.9402144
0.99999994
season
2.5808203
2.5808203
1.0
loss
2.8209555
2.8209555
0.99999994
echo
2.7552948
2.7552948
1.0
interest
2.5428932
2.5428932
1.0
reform
3.0955439
3.0955439
1.0
faint
2.9436555
2.9436555
1.0
talkative
3.370544
3.370544
0.99999994
traction
3.2631102
3.2631102
1.0
slump
3.642595
3.642595
1.0
fine
2.6069028
2.6069028
0.9999999


# Cosine Similarity Function

In [None]:
def cosine(x,y):
    x_norm = np.sqrt(x.T @ x)
    y_norm = np.sqrt(y.T @ y)
    cross = x.T @ y
    cos = cross / (x_norm * y_norm)
    return cos

## Find Subspace Spanned by two Word Vectors, then Find Vectors with Lowest Distance to the Subspace

In [None]:
meat_bread = model_np[:, [model.key_to_index['meat'], model.key_to_index['bread']]]
proj = meat_bread @ np.linalg.inv(meat_bread.T @ meat_bread) @ meat_bread.T
null_proj = np.eye(300) - proj
smallest_dist = [1000, 1000, 1000]
smallest_dist_word = [0, 0, 0]
for i in range(200):
    vecs = model_np[:, (i*100):((i+1)*100)]
    
    dists = np.diag(vecs.T @ (null_proj) @ vecs)
    diag_sort = np.argsort(dists)
    print(model.index_to_key[(i*100):((i+1)*100)][0:10])
    print(diag_sort[0:10])
    #global_indices = np.array(model.index_to_key[(i*100):((i+1)*100)])[diag_sort]
    global_indices = np.arange((i*100), ((i+1)*100))
    
#    print(f"Smallest: {model.index_to_key[global_indices[0]]} = {dists[diag_sort[0]]}, {model.index_to_key[global_indices[1]]} = {dists[diag_sort[1]]}, {model.index_to_key[global_indices[2]]} = {dists[diag_sort[2]]}")
#    print(f"Furthest: {model.index_to_key[global_indices[-1]]} = {dists[diag_sort[-1]]}")
    if dists[diag_sort[0]] < smallest_dist[0]:
        smallest_dist[2] = smallest_dist[1]
        smallest_dist[1] = smallest_dist[0]
        smallest_dist[0] = dists[diag_sort[0]]
        smallest_dist_word[2] = smallest_dist_word[1]
        smallest_dist_word[1] = smallest_dist_word[0]
        smallest_dist_word[0] = model.index_to_key[global_indices[0]]

smallest_dist_word

### Same ish as before but with normalized vectors

In [None]:
meat_bread = norm_model_np[:, [model.key_to_index['add'], model.key_to_index['subtract']]]
proj = meat_bread @ np.linalg.inv(meat_bread.T @ meat_bread) @ meat_bread.T
null_proj = np.eye(300) - proj
smallest_dist = [1000, 1000, 1000]
smallest_dist_word = [0, 0, 0]
for i in range(30000):
    vecs = norm_model_np[:, (i*100):((i+1)*100)]
    
    dists = np.diag(vecs.T @ (null_proj) @ vecs)
    diag_sort = np.argsort(dists)
    global_indices = np.arange((i*100), ((i+1)*100))
    
    if dists[diag_sort[0]] < smallest_dist[0]:
        smallest_dist[2] = smallest_dist[1]
        smallest_dist[1] = smallest_dist[0]
        smallest_dist[0] = dists[diag_sort[0]]
        smallest_dist_word[2] = smallest_dist_word[1]
        smallest_dist_word[1] = smallest_dist_word[0]
        smallest_dist_word[0] = model.index_to_key[global_indices[0]]

smallest_dist_word

In [None]:
word_index = model.key_to_index['fruit']
word = 'fruit'
smallest_cos = 000
smallest_dist = 000
smallest_dist_norm = 000
word_cos = 'a'
word_dist = 'a'
word_dist_norm = 'a'
for i in range(model_np.shape[1]):
    cos = np.abs(cosine(model_np[:,word_index], model_np[:,i]))
#    cos = np.abs(model.similarity(word, i))
    dist = np.sum(model_np[:, word_index] - model_np[:,i])
    dist_norm = np.sum(norm_model_np[:, word_index] - norm_model_np[:,i])
    if cos > smallest_cos:
        smallest_cos = cos
        word_cos = model.index_to_key[i]
    if dist > smallest_dist:
        smallest_dist = dist
        word_dist = model.index_to_key[i]
    if dist_norm > smallest_dist_norm:
        smallest_dist_norm = dist_norm
        word_dist_norm = model.index_to_key[i]

print(word_cos, smallest_cos)
print(word_dist)
print(word_dist_norm)

### Same ish as before with cosine similarity instead of distance

In [92]:
meat_bread = norm_model_np[:, [model.key_to_index['meat'], model.key_to_index['bread']]]
proj = meat_bread @ np.linalg.inv(meat_bread.T @ meat_bread) @ meat_bread.T
min_cos = [0, 0, 0]
min_cos_word = [0, 0, 0]
for i in range(1200):
    index_range = np.arange((i*100), ((i+1)*100))
    vecs = norm_model_np[:, index_range]
    vec_proj = proj @ vecs
    norm_vec_proj = vec_proj / np.linalg.norm(vec_proj, axis = 0)

    cosines = np.diag(norm_vec_proj.T @ vecs)
    cosine_sort_order = np.argsort(cosines)
    global_sort_order = index_range[cosine_sort_order]

    if cosines[cosine_sort_order[-1]] > min_cos[0]:
        min_cos[2] = min_cos[1]
        min_cos[1] = min_cos[0]
        min_cos[0] = cosines[cosine_sort_order[-1]]
        min_cos_word[2] = min_cos_word[1]
        min_cos_word[1] = min_cos_word[0]
        min_cos_word[0] = model.index_to_key[global_sort_order[-1]]
min_cos_word

['bread', 'meat', 'food']

In [114]:
meat_bread = norm_model_np[:, [model.key_to_index['cat'], model.key_to_index['dog']]]
proj = meat_bread @ np.linalg.inv(meat_bread.T @ meat_bread) @ meat_bread.T
vec_proj = proj @ norm_model_np
vec_proj = vec_proj / np.linalg.norm(vec_proj, axis = 0)
cosines = np.sum(vec_proj * norm_model_np, axis=0)
cosine_sort_order = np.argsort(cosines)
print(cosine_sort_order[-10:])
for i in range(20):
    print(model.index_to_key[cosine_sort_order[-(1+i)]], end = "; ")

[40476  3449 16254 19444 20452 28394 10033  2105  1593  4142]
cat; dog; dogs; puppy; beagle; pooch; pup; kitten; pet; chihuahua; Pomeranian; feline; dachshund; schnauzer; poodle; Dog; pug; canine; collie; terrier; 

### I think this jawn is the same ish as before, aka find subspace, project each vector on subspace, then find cosine similarity between original vector and its projection, but as a function. I also tested a couple scoring functions.

In [159]:
def plane_similarity(w1, w2, alpha):
    M = norm_model_np[:, [model.key_to_index[w1], model.key_to_index[w2]]]
    P = M @ np.linalg.inv(M.T @ M) @ M.T
    P_corpus = P @ norm_model_np
    P_norm_corpus = P_corpus / np.linalg.norm(P_corpus, axis = 0)
    cosines = np.sum(P_norm_corpus * norm_model_np, axis = 0)
    cosines_sort_order = np.argsort(cosines)
    w1_cosines = np.sum(norm_model_np * M[:,[0]], axis = 0)
    w2_cosines = np.sum(norm_model_np * M[:,[1]], axis = 0)
    cosines_squ_w1_w2 = cosines**2 / (w1_cosines * w2_cosines)
    cosines_squ_w1_w2_sort_order = np.argsort(cosines_squ_w1_w2)

    score = cosines - alpha * ((w1_cosines + w2_cosines) / 2)
    score_sort_order = np.argsort(score)
    for i in range(20):
        regular_word = model.index_to_key[cosines_sort_order[-(i+1)]]
        scaled_word  = model.index_to_key[cosines_squ_w1_w2_sort_order[-(i+1)]]
        score_word   = model.index_to_key[score_sort_order[-(i+1)]]
        jos = [cosines[cosines_sort_order[-(i+1)]], w1_cosines[cosines_sort_order[-(i+1)]], w2_cosines[cosines_sort_order[-(i+1)]], cosines_squ_w1_w2[cosines_squ_w1_w2_sort_order[-(i+1)]]]
        
        print(f"Regular: {regular_word:<20}  Scaled: {scaled_word:<20}  Score: {score_word:<20}", jos)


In [160]:
plane_similarity('sword', 'armor', 0.7)

Regular: armor                 Scaled: minimize              Score: armor                [1.0, 0.3517392, 1.0, 28903.34]
Regular: sword                 Scaled: ned                   Score: sword                [0.9999998, 1.0, 0.3517392, 21909.463]
Regular: broadsword            Scaled: spec                  Score: knife                [0.66471547, 0.65937245, 0.31066358, 9393.199]
Regular: scimitar              Scaled: Trisul                Score: broadsword           [0.61209047, 0.60222614, 0.314278, 9392.594]
Regular: weaponry              Scaled: gombeen               Score: machete              [0.5976066, 0.3583247, 0.5737396, 7923.0474]
Regular: rapier                Scaled: Cowling               Score: BUNA                 [0.59382147, 0.57975984, 0.32417756, 6835.2334]
Regular: buckler               Scaled: Sheltered             Score: OFO                  [0.5895754, 0.5229112, 0.43885458, 5571.3423]
Regular: breastplate           Scaled: unexercised           Score: SULA   

# Find Mean Vector between two Words, then Find Third Vector that is Most Similar to the Mean Vec

In [168]:
def angle_similarity(w1, w2):
    v1 = norm_model_np[:, [model.key_to_index[w1]]]
    v2 = norm_model_np[:, [model.key_to_index[w2]]]
    mean_vec = (v1 + v2)/2
    print(v1.shape)
    cosines = np.sum(norm_model_np * mean_vec, axis = 0)
    cosines_sort_order = np.argsort(cosines)
    for i in range(20):
        word = model.index_to_key[cosines_sort_order[-(i+1)]]
        cos = cosines[cosines_sort_order[-(i+1)]]
        print(f"{word}: {cos}")
angle_similarity('joke', 'ohio')

(300, 1)
joke: 0.5641732215881348
ohio: 0.5641732215881348
michigan: 0.4537144601345062
tracy: 0.4050638973712921
utah: 0.4015306830406189
iowa: 0.3954731822013855
florida: 0.393928200006485
indiana: 0.3905448913574219
charlie: 0.3876674771308899
funny: 0.385683536529541
jared: 0.3853279948234558
missouri: 0.3847235441207886
alabama: 0.3835403621196747
pete: 0.3834690451622009
sarah: 0.38255104422569275
arkansas: 0.38168781995773315
scarry: 0.38066619634628296
donovan: 0.3804360032081604
moron: 0.3803333640098572
thats: 0.3787262439727783


In [164]:
jo = np.random.randint(0,20, (5,5))
po = np.random.randint(0,20,(5,1))
print(jo)
print(po)

[[ 0 13  8  6 10]
 [ 2  6  4  9 12]
 [ 0 10  5 19  2]
 [ 6  8 13 11  3]
 [18  6  3  0  4]]
[[12]
 [ 9]
 [18]
 [10]
 [ 3]]


In [174]:
lens = np.sqrt(np.sum(model_np * model_np, axis = 0))
lens.sort()

In [175]:
lens

array([0.28217894, 0.3433174 , 0.43042833, ..., 8.988994  , 9.110525  ,
       9.4089775 ], dtype=float32)