# Gensim word vector visualization of various word vectors

In [1]:
import numpy as np

# Get the interactive Tools for Matplotlib
# %matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

import gensim.downloader as api
from gensim.models import KeyedVectors

For looking at word vectors, I'll use Gensim. We also use it in hw1 for word vectors. Gensim isn't really a deep learning package. It's a package for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But its efficient and scalable, and quite widely used.

I'll use our homegrown Stanford offering of GloVe word vectors. Gensim provides a library of several sets of word vectors that you can easily load. You can find out more about GloVe on [the Glove page](https://nlp.stanford.edu/projects/glove/). I use the 100d vectors below as a balance between speed and smallness vs. quality. If you try out the 50d vectors, they basically work for similarity but clearly aren't as good for analogy problems. If you load the 300d vectors, you'll wait longer, but they're even better than the 100d vectors.

In [2]:
# model = api.load("glove-wiki-gigaword-100")

# word2vec_output = '/home/jia/PycharmProjects/CS224n/GloVe/glove.42B.300d.word2vec.txt'
word2vec_output = '/home/jia/PycharmProjects/CS224n/GloVe/vectors.word2vec.txt'
model = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)
print(type(model))

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [3]:
model['bread']

array([-0.226175, -0.750143, -1.04438 ,  0.672386,  0.207265,  0.177585,
        0.24182 ,  0.148822, -1.514996,  0.20957 , -0.212057,  0.858471,
       -0.521985, -0.366885,  0.855122,  0.523743, -0.506034,  0.916326,
        0.539605,  0.003791, -0.121866, -0.116245, -0.857338, -1.145494,
       -0.275651, -0.525094,  0.402574, -0.257563, -1.189499, -0.451964,
       -0.251207, -0.261346, -0.772089, -1.095427, -0.065201, -0.148957,
        0.281674, -0.350438, -0.658632, -0.407328, -0.102456, -0.769148,
       -0.351928,  0.527741, -0.029045, -0.213448, -0.032432, -0.091268,
        0.079164, -0.333354], dtype=float32)

In [4]:
model['croissant']

In [5]:
model.most_similar('usa')

[('united', 0.7216923832893372),
 ('canada', 0.6587510108947754),
 ('carolina', 0.658103346824646),
 ('america', 0.6471624970436096),
 ('georgia', 0.6444755792617798),
 ('australia', 0.6360775232315063),
 ('california', 0.6339578628540039),
 ('states', 0.6284757256507874),
 ('illinois', 0.6216531991958618),
 ('canton', 0.6193104386329651)]

In [6]:
model.most_similar('banana')

[('daiquiri', 0.7199301719665527),
 ('bolivian', 0.6042724251747131),
 ('knock', 0.5829662680625916),
 ('heck', 0.5539539456367493),
 ('raisin', 0.5375456213951111),
 ('bobble', 0.5256161093711853),
 ('glycerin', 0.5199574828147888),
 ('orb', 0.511772632598877),
 ('dyke', 0.5088387131690979),
 ('bubble', 0.5065319538116455)]

In [7]:
model.most_similar('croissant')

In [8]:
model.most_similar(negative='banana')

[('dogmatic', 0.6157041788101196),
 ('academic', 0.5658379197120667),
 ('preparatory', 0.5604988932609558),
 ('doctorate', 0.5505353808403015),
 ('mba', 0.5357274413108826),
 ('undergraduate', 0.5328283309936523),
 ('degrees', 0.520689845085144),
 ('ae', 0.5090684294700623),
 ('invitation', 0.5082197189331055),
 ('offering', 0.5028907656669617)]

In [9]:
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

elizabeth: 0.7589


In [10]:
# x1 : x2 :: y1 :: returned
def analogy(x1, x2, y1):
    result = model.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

![Analogy](imgs/word2vec-king-queen-composition.png)

In [11]:
analogy('man', 'king', 'woman')

'elizabeth'

In [12]:
analogy('australia', 'beer', 'france')

'wine'

In [13]:
analogy('pencil', 'sketching', 'camera')

'compulsion'

In [14]:
analogy('obama', 'clinton', 'reagan')

'bush'

In [15]:
analogy('tall', 'tallest', 'long')

'world'

In [16]:
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


In [17]:
def display_pca_scatterplot(model, words=None, sample=0):
    if words == None:
        if sample > 0:
            words = np.random.choice(list(model.vocab.keys()), sample)
        else:
            words = [ word for word in model.vocab ]
        
    word_vectors = np.array([model[w] for w in words])

    twodim = PCA().fit_transform(word_vectors)[:,:2]
    
    plt.figure(figsize=(6,6))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.annotate(word, (x, y), xytext=(x+0.05, y+0.05))

In [18]:
display_pca_scatterplot(model, 
    ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
        'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
        'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
        'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
        'france', 'germany', 'hungary', 'luxembourg', 'australia', 'china', 'iran',
        'homework', 'assignment', 'problem', 'exam', 'test', 'class',
        'school', 'college', 'university', 'institute'])

KeyError: "Key 'falafel' not present"

In [None]:
import pandas as pd
df = pd.read_csv("wordsim353/combined.csv", header=0)
df.head(3)

In [None]:
def cosine_similarity(vec1, vec2):
    """计算余弦相似度"""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [None]:
# 存储模型预测相似度
model_scores = []
human_scores = []
missing_words = []
for index, row in df.iterrows():
    word1 = row['Word 1'].lower()  # 统一转小写
    word2 = row['Word 2'].lower()
    
    # 检查词是否存在
    if word1 in model.key_to_index and word2 in model.key_to_index:
        vec1 = model[word1]
        vec2 = model[word2]
        sim = cosine_similarity(vec1, vec2)
        model_scores.append(sim)
        human_scores.append(row['Human (mean)'])
    else:
        missing_words.append((word1, word2))
print(f"有效样本数: {len(model_scores)}/{len(df)}")
print(f"缺失词对示例: {missing_words[:5]}")

In [None]:
from scipy.stats import spearmanr
assert len(model_scores) == len(human_scores)

In [None]:
corr, p_value = spearmanr(human_scores, model_scores)
print(f"Spearman相关系数: {corr:.3f}")
print(f"P值: {p_value:.5e}")