# Word2Vec Visualization

This notebook visualizes Word2Vec embeddings and their properties.

CBOW → smooth context, good for frequent words <br>
SGNS → better for rare words & topic cluster sharpness

In [None]:
import sys
import os
sys.path.append("/Users/Patron/github-projects/embeddings")
from pretraining.word2vec.skipgram import SkipGramSGNS

model = SkipGramSGNS.load('../../results/skipgram')

In [23]:
import sys
import os
sys.path.append("/Users/Patron/github-projects/embeddings")
from pretraining.word2vec.cbow import CBOWSGNS
from pretraining.word2vec.skipgram import SkipGramSGNS
skipgram_model = SkipGramSGNS.load('../../results/skipgram')
cbow_model = CBOWSGNS.load('../../results/cbow')

def category_top_nearest(model, anchor, topn=10):
    M = (model.W_in + model.W_out) / 2
    M = M / (np.linalg.norm(M,axis=1,keepdims=True)+1e-9)
    v = model.W_in[model.word2id[anchor]]
    v = v / (np.linalg.norm(v)+1e-9)
    sims = M @ v
    order = np.argsort(-sims)
    return [model.id2word[i] for i in order[:topn]]

stable = [
    "king","queen","prince","emperor",
    "france","germany","england","russia",
    "christianity","islam","buddhism","catholic",
    "army","navy","troops","battle",
]

for w in stable:
    print("SKIPGRAM TOP:", w, category_top_nearest(skipgram_model,w,10))
    print("CBOW TOP    :", w, category_top_nearest(cbow_model,w,10))
    print()


SKIPGRAM TOP: king ['ansel', 'jacaranda', 'nazwy', 'chanda', 'fforde', 'king', 'guerin', 'arrondissement', 'pinakothek', 'jeziora']
CBOW TOP    : king ['nathaniel', 'andre', 'comte', 'algeria', 'anchor', 'christie', 'asteroid', 'rand', 'businessman', 'huxley']

SKIPGRAM TOP: queen ['ansel', 'fforde', 'nazwy', 'puntarenas', 'pinakothek', 'guerin', 'jacaranda', 'arrondissement', 'macleish', 'farquhar']
CBOW TOP    : queen ['nathaniel', 'comte', 'andre', 'rand', 'algeria', 'anchor', 'willard', 'huxley', 'christie', 'asteroid']

SKIPGRAM TOP: prince ['ansel', 'puntarenas', 'fforde', 'nazwy', 'cisalpine', 'macleish', 'ergine', 'parana', 'schwarzenberg', 'modernisme']
CBOW TOP    : prince ['nathaniel', 'comte', 'andre', 'rand', 'algeria', 'anchor', 'willard', 'christie', 'asteroid', 'huxley']

SKIPGRAM TOP: emperor ['roman', 'brigantes', 'puntarenas', 'lacedaemon', 'punning', 'beholden', 'ansel', 'tigran', 'jacaranda', 'ojos']
CBOW TOP    : emperor ['nathaniel', 'comte', 'andre', 'algeria', 

In [6]:
def most_similar(model, word, topn=10):
    import numpy as np
    vecs = (model.W_in + model.W_out) / 2.0
    v = vecs[model.word2id[word]]
    sims = vecs @ v / (np.linalg.norm(vecs,axis=1)*np.linalg.norm(v)+1e-9)
    order = np.argsort(-sims)
    return [(model.id2word[i], float(sims[i])) for i in order[1:topn+1]]


In [7]:
print(most_similar(model, "war"))

[('forces', 0.9961369633674622), ('army', 0.995850682258606), ('ii', 0.9958368539810181), ('nazi', 0.9958155155181885), ('germany', 0.9957225322723389), ('allied', 0.9953802227973938), ('troops', 0.9953007698059082), ('outbreak', 0.9952908158302307), ('fought', 0.9951883554458618), ('battle', 0.9949401021003723)]


In [8]:
print(most_similar(model, "king"))
print(most_similar(model, "paris"))


[('queen', 0.9977585077285767), ('son', 0.9977362751960754), ('prince', 0.9971786737442017), ('daughter', 0.9970829486846924), ('pope', 0.9969437122344971), ('succeeded', 0.9969127774238586), ('reign', 0.9968228936195374), ('emperor', 0.9967514276504517), ('henry', 0.996737003326416), ('brother', 0.996713399887085)]
[('berlin', 0.9994198083877563), ('vienna', 0.9992685317993164), ('lincoln', 0.9992021918296814), ('florence', 0.9991916418075562), ('milan', 0.9991791844367981), ('moscow', 0.9991526007652283), ('clown', 0.9991336464881897), ('abraham', 0.999121904373169), ('munich', 0.9991217255592346), ('oswald', 0.9991183876991272)]


In [9]:
import numpy as np
def cosine(a,b): 
    return float(a @ b / (np.linalg.norm(a)*np.linalg.norm(b)))

pairs = [
    ("king","queen"),
    ("king","man"),
    ("king","woman"),
    ("king","dog"),
    ("paris","france"),
    ("paris","london"),
    ("cat","dog")
]

for w1, w2 in pairs:
    if w1 in model.word2id and w2 in model.word2id:
        v1 = model.W_in[model.word2id[w1]]
        v2 = model.W_in[model.word2id[w2]]
        print(w1, w2, cosine(v1,v2))


king queen 0.8503580093383789
king man 0.36603087186813354
king woman 0.28577324748039246
king dog 0.3268983066082001
paris france 0.4163320064544678
paris london 0.7193044424057007
cat dog 0.5459689497947693


In [None]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

words = ["king","queen","man","woman","paris","france","london","england","cat","dog","computer","laptop"]
vecs = np.array([model.W_in[model.word2id[w]] for w in words if w in model.word2id])

tsne = TSNE(n_components=2, perplexity=8, random_state=42)
xy = tsne.fit_transform(vecs)

plt.figure(figsize=(8,6))
for i,w in enumerate(words):
    if w in model.word2id:
        plt.scatter(xy[i,0],xy[i,1])
        plt.text(xy[i,0]+0.01, xy[i,1]+0.01, w)
plt.title("TSNE sanity cluster check")
plt.show()


In [None]:
import sys
import os
sys.path.append("/Users/Patron/github-projects/embeddings")
from pretraining.utils.benchmark_datasets import EmbeddingBenchmarkDatasets

In [None]:
import numpy as np

def prepare_matrix_for_eval(model, which="combined"):
    M = (model.W_in + model.W_out) / 2.0 if which=="combined" else \
        (model.W_in if which=="in" else model.W_out)
    M = M.copy()
    # mean-center per dimension
    M -= M.mean(axis=0, keepdims=True)
    # row-normalize
    M /= (np.linalg.norm(M, axis=1, keepdims=True) + 1e-9)
    return M

M = prepare_matrix_for_eval(model, which="combined")
w2i = model.word2id; i2w = model.id2word

def most_similar_prepared(M, w, topn=10):
    v = M[w2i[w]]
    sims = M @ v
    order = np.argsort(-sims)
    return [(i2w[i], float(sims[i])) for i in order[1:topn+1]]


In [21]:
semantic_tests = [
    ("king","queen","man","woman"),
    ("paris","france","london","england"),
    ("france","paris","germany","berlin"),
    ("apple","fruit","carrot","vegetable"),
    ("car","automobile","plane","aircraft")
]
for a,b,c,d in semantic_tests:
    print(a,b,c,d)
    print(analogy_3cosadd(model,a,b,c)[:5])
    print()


king queen man woman
[('man', 0.9958233833312988), ('woman', 0.9956870079040527), ('money', 0.9952985644340515), ('loved', 0.9950323104858398), ('herself', 0.9949503540992737)]

paris france london england
[('france', 0.9984983801841736), ('netherlands', 0.9984973669052124), ('britain', 0.9980919361114502), ('spain', 0.9980418682098389), ('italy', 0.9979803562164307)]

france paris germany berlin
[('paris', 0.9984679818153381), ('berlin', 0.9980963468551636), ('reconstruction', 0.9980744123458862), ('moscow', 0.9979672431945801), ('franco', 0.9979330897331238)]

apple fruit carrot vegetable
[('albinoni', -0.37984156608581543), ('sauna', -0.3893294334411621), ('births', -0.3918601870536804), ('fforde', -0.3957656919956207), ('nazwy', -0.4002169072628021)]

car automobile plane aircraft
[('plane', 0.9978063106536865), ('algebra', 0.9970730543136597), ('linear', 0.9970618486404419), ('finite', 0.997042179107666), ('dimensional', 0.9970002174377441)]



In [20]:
import numpy as np

def analogy_3cosadd(model, a,b,c, topn=5):
    M = (model.W_in + model.W_out) / 2.0
    w2i = model.word2id
    
    v = M[w2i[b]] - M[w2i[a]] + M[w2i[c]]
    
    # normalize BOTH properly
    M_norm = M / (np.linalg.norm(M, axis=1, keepdims=True) + 1e-9)
    v = v / (np.linalg.norm(v) + 1e-9)
    
    sims = M_norm @ v
    order = np.argsort(-sims)
    return [(model.id2word[i], float(sims[i])) for i in order[:topn]]

tests = [
    ("run","running","swim","swimming"),
    ("walk","walking","talk","talking"),
    ("nation","nations","state","states"),
    ("program","programs","computer","computers"),
    ("day","days","year","years"),
    ("cat","cats","dog","dogs"),
]
for a,b,c,d in tests:
    print(f"{a}:{b} :: {c}:?")
    print(analogy_3cosadd(model,a,b,c,topn=5))
    pred = analogy_3cosadd(model,a,b,c,topn=3)[0][0]
    print(f"{a}:{b} :: {c}:{d} | predicted={pred} | correct={pred==d}")
    print()


run:running :: swim:?
[('lice', 0.9815119504928589), ('story', 0.9814983606338501), ('coates', 0.9812896251678467), ('quasar', 0.9811702370643616), ('cowardice', 0.9811632633209229)]
run:running :: swim:swimming | predicted=lice | correct=False

walk:walking :: talk:?
[('talk', 0.9989298582077026), ('cinema', 0.9985308647155762), ('guy', 0.9985108971595764), ('mad', 0.998444139957428), ('wise', 0.9984428286552429)]
walk:walking :: talk:talking | predicted=talk | correct=False

nation:nations :: state:?
[('state', 0.9992780089378357), ('federal', 0.9980740547180176), ('security', 0.9974332451820374), ('nations', 0.9974052906036377), ('administration', 0.9972826838493347)]
nation:nations :: state:states | predicted=state | correct=False

program:programs :: computer:?
[('computer', 0.99893718957901), ('software', 0.9968540668487549), ('systems', 0.9964869022369385), ('applications', 0.9959800243377686), ('digital', 0.9958880543708801)]
program:programs :: computer:computers | predicted=c

In [None]:
probe = ["smaller","shorter","slower","talking","swimming",
         "years","states","computers"]
for w in probe:
    print(w, w in model.word2id)


In [11]:
def vocab_check(words):
    return {w: (w in model.word2id) for w in words}

print(vocab_check(["big","bigger","small","smaller"]))
print(vocab_check(["long","longer","short","shorter"]))
print(vocab_check(["quick","quicker","slow","slower"]))
print(vocab_check(["run","running","play","playing"]))
print(vocab_check(["walk","walking","swim","swimming"]))
print(vocab_check(["day","days","year","years"]))
print(vocab_check(["nation","nations","state","states"]))
print(vocab_check(["program","programs","computer","computers"]))


{'big': True, 'bigger': True, 'small': True, 'smaller': True}
{'long': True, 'longer': True, 'short': True, 'shorter': True}
{'quick': True, 'quicker': True, 'slow': True, 'slower': True}
{'run': True, 'running': True, 'play': True, 'playing': True}
{'walk': True, 'walking': True, 'swim': True, 'swimming': True}
{'day': True, 'days': True, 'year': True, 'years': True}
{'nation': True, 'nations': True, 'state': True, 'states': True}
{'program': True, 'programs': True, 'computer': True, 'computers': True}


In [None]:
from collections import Counter
# load the same tokenized list-of-lists you trained on
tok = loader.load_text8(chunk_size=256)  # or however you cached it
cnt = Counter(w for s in tok for w in s)
for w in probe:
    print(w, cnt[w])


In [12]:
def keep_iv(quads, word2id):
    iv = []
    for a,b,c,d in quads:
        if all(w in word2id for w in (a,b,c,d)):
            iv.append((a,b,c,d))
    return iv

quads = [
    ("big","bigger","small","smaller"),
    ("long","longer","short","shorter"),
    ("run","running","play","playing"),
    ("day","days","year","years"),
    ("nation","nations","state","states"),
    ("program","programs","computer","computers")
]
iv_quads = keep_iv(quads, model.word2id)
print("IV analogies kept:", len(iv_quads), "/", len(quads))


IV analogies kept: 6 / 6


In [13]:

tests = [
    ("big","bigger","small","smaller"),
    ("long","longer","short","shorter"),
    ("quick","quicker","slow","slower"),
    ("run","running","play","playing"),
    ("walk","walking","swim","swimming"),
    ("day","days","year","years"),
    ("nation","nations","state","states"),
    ("program","programs","computer","computers"),
]
for a,b,c,d in tests:
    print(f"{a}:{b} :: {c}:?")
    print("Predicted:", analogy_3cosadd(model,a,b,c,topn=10))
    print("Expected:", d)
    print()


big:bigger :: small:?
Predicted: [('small', 0.9946258068084717), ('large', 0.9945324063301086), ('low', 0.9898232817649841), ('relatively', 0.9897944927215576), ('water', 0.9895567893981934), ('areas', 0.9892217516899109), ('range', 0.9883784651756287), ('lower', 0.9883126020431519), ('higher', 0.9882976412773132), ('larger', 0.9882911443710327)]
Expected: smaller

long:longer :: short:?
Predicted: [('collection', 0.9951975345611572), ('genre', 0.9951460361480713), ('versions', 0.9950860738754272), ('cards', 0.9950084686279297), ('publish', 0.9949991106987), ('molecular', 0.994952380657196), ('memory', 0.9949322938919067), ('abstract', 0.9949235320091248), ('ethernet', 0.9949174523353577), ('aspects', 0.9949017763137817)]
Expected: shorter

quick:quicker :: slow:?
Predicted: [('quicker', 0.9981658458709717), ('slow', 0.9980124235153198), ('decrease', 0.9976077079772949), ('strategies', 0.9975178837776184), ('rainfall', 0.9974513053894043), ('unemployment', 0.9974422454833984), ('oil', 

In [None]:
from pretraining.utils.benchmark_datasets import EmbeddingBenchmarkDatasets
from pretraining.utils.evaluation.eval_intrinsic import evaluate_embeddings
from skipgram import SkipGramSGNS
from pathlib import Path
import json
model = SkipGramSGNS.load('../../results/skipgram')
word_vecs = model.export_word_vectors(which="combined")
loader = EmbeddingBenchmarkDatasets(data_dir="datasets")
analogy_pairs = loader.get_word_analogy_pairs()
similarity_pairs = loader.get_word_similarity_pairs()

# convert our (word -> vector) dict form from your export_word_vectors()
# NOTE: this returns dict[str, np.ndarray]
intrinsic_results = evaluate_embeddings(
    word_vectors=word_vecs,
    similarity_pairs=[(*p, 0.0) if len(p)==2 else p for p in similarity_pairs],  # our similarity had gold score included already in your new updated pairs
    analogy_quads=[tuple(a) for a in analogy_pairs],
    topk=1,
    lowercase=True
)

# save
model_dir = Path("../../results/skipgram")
intrinsic_path = model_dir / "intrinsic_eval.json"
with open(intrinsic_path, "w") as f:
    json.dump(intrinsic_results, f, indent=2)
print("[SkipGram] Intrinsic Eval:", intrinsic_results)

In [None]:
from cbow import CBOWSGNS
model = CBOWSGNS.load('../../results/cbow')

word_vecs = model.export_word_vectors(which="combined")
loader = EmbeddingBenchmarkDatasets(data_dir="datasets")
analogy_pairs = loader.get_word_analogy_pairs()
similarity_pairs = loader.get_word_similarity_pairs()

# convert our (word -> vector) dict form from your export_word_vectors()
# NOTE: this returns dict[str, np.ndarray]
intrinsic_results = evaluate_embeddings(
    word_vectors=word_vecs,
    similarity_pairs=[(*p, 0.0) if len(p)==2 else p for p in similarity_pairs],  # our similarity had gold score included already in your new updated pairs
    analogy_quads=[tuple(a) for a in analogy_pairs],
    topk=1,
    lowercase=True
)

# save
model_dir = Path("../../results/cbow")
intrinsic_path = model_dir / "intrinsic_eval.json"
with open(intrinsic_path, "w") as f:
    json.dump(intrinsic_results, f, indent=2)
print("[CBOW] Intrinsic Eval:", intrinsic_results)

In [None]:
probe_words = ["hydrogen","oxygen","carbon","germany","france","europe","napoleon","hitler","allies","axis", "christianity","islam","buddhism","judaism","gravity","relativity","quantum","electron","proton","neutron"]
for w in probe_words:
    print(w, most_similar(model,w,topn=10))


In [14]:
from sklearn.metrics.pairwise import cosine_similarity as cosine
def category_coherence(words):
    import numpy as np
    sims=[]
    for i in range(len(words)):
        for j in range(i+1,len(words)):
            if words[i] in model.word2id and words[j] in model.word2id:
                v1 = model.W_in[model.word2id[words[i]]]
                v2 = model.W_in[model.word2id[words[j]]]
                sims.append(cosine(v1.reshape(1,-1),v2.reshape(1,-1))[0][0])
    return np.mean(sims)

religions = ["christianity","islam","judaism","buddhism"]
ww2 = ["hitler","napoleon","troops","army","germany"]
math = ["geometry","algebra","calculus","equation","theorem"]

print("Religions coherence:", category_coherence(religions))
print("WW2 coherence:", category_coherence(ww2))
print("Math coherence:", category_coherence(math))


Religions coherence: 0.62820596
WW2 coherence: 0.6164529
Math coherence: 0.7367727


In [15]:
countries = ["poland", "germany", "france", "spain", "italy"]
instruments = ["piano", "guitar", "drums", "violin"]
sports = ["football", "basketball", "baseball", "tennis"]
tech_companies = ["apple", "microsoft", "ibm", "intel", "google"]
elements = ["hydrogen", "oxygen", "nitrogen", "carbon"]
royalty = ["king", "queen", "prince", "duke", "emperor"]

print("Countries coherence:", category_coherence(countries))
print("Instruments coherence:", category_coherence(instruments))
print("Sports coherence:", category_coherence(sports))
print("Tech Companies coherence:", category_coherence(tech_companies))
print("Elements coherence:", category_coherence(elements))
print("Royalty coherence:", category_coherence(royalty))

Countries coherence: 0.84353924
Instruments coherence: 0.60676914
Sports coherence: 0.84160763
Tech Companies coherence: 0.5220221
Elements coherence: 0.756093
Royalty coherence: 0.75009465


In [None]:
probe_words = ["king","paris","computer","apple","war","music"]
for w in probe_words:
    print(w, most_similar(model,w,topn=10))


In [16]:
def cluster_coherence(word, topn=10):
    sims = most_similar(model,word,topn)
    words = [w for w,_ in sims]
    vecs = np.array([model.W_in[model.word2id[w]] for w in words])
    S = np.corrcoef(vecs @ vecs.T)
    return np.mean(S)

for w in ["king","paris","computer"]:
    print(w, cluster_coherence(w))


king 0.8834876484952464
paris 0.8720277915100092
computer 0.8057821944022688
