In [1]:
import nltk

import numpy as np
import scipy.stats
import scipy.sparse
import pandas as pd
import sklearn.decomposition

In [2]:
# Load the Brown corpus as a flat list of tokens.
# The corpus is fetched only when it is not installed yet, so the notebook
# also runs from a fresh environment without editing this cell.
try:
    nltk.data.find('corpora/brown')
except LookupError:
    nltk.download('brown')
corpus = list(nltk.corpus.brown.words())

In [3]:
# Build the base vocabulary W: the 5000 most frequent corpus tokens that
# contain at least one alphabetic character.
def is_eng(word):
    """Return True when at least one character of `word` is alphabetic."""
    return any(ch.isalpha() for ch in word)


eng_words = [token for token in corpus if is_eng(token)]

# The frequency table is keyed by 1-grams (tuples), so the word itself is the
# first element of each key.
unigram_freq = nltk.FreqDist(nltk.ngrams(eng_words, 1))
W = [unigram[0] for unigram, _count in unigram_freq.most_common(5000)]
# vocab maps every word to its position in W.
vocab = dict(zip(W, range(len(W))))

# Show both ends of the frequency ranking: the 5 most and the 5 least common
# words in W.
top_5 = W[:5]
bottom_5 = W[-5:]
print(top_5, bottom_5)

# Extend W with the words from Table 1 of RG65 that are not in the vocabulary
# yet. Columns 0 and 1 of the CSV are read as the two words of each pair.
df = pd.read_csv('./data/table1-rg65.csv', header=None)
for word in pd.concat([df[0], df[1]]):
    if word in vocab:
        continue
    # A new word takes the next free index, which is also its position in W.
    vocab[word] = len(vocab)
    W.append(word)

['the', 'of', 'and', 'to', 'a'] ['figured', 'Family', 'Abel', 'shaking', 'tent']


In [4]:
# Construct word-context vector model M1 by collecting bigram counts.
# Only bigrams whose two tokens are both in the vocabulary are kept; the first
# token selects the row and the second token selects the column.
bigram_freq = nltk.FreqDist(nltk.ngrams(corpus, 2))
kept = [
    (vocab[first], vocab[second], freq)
    for (first, second), freq in bigram_freq.items()
    if first in vocab and second in vocab
]
row_ind = [row for row, _, _ in kept]
col_ind = [col for _, col, _ in kept]
data = [freq for _, _, freq in kept]

M1 = scipy.sparse.csr_array((data, (row_ind, col_ind)), shape=(len(vocab), len(vocab)))

In [5]:
# Compute PPMI on M1 as M1+.
# PMI(w, c) = log2(P(w, c) / (P(w) * P(c))); negative values are clipped to 0.
total = M1.sum()
word_cnt = M1.sum(axis=1)
context_cnt = M1.sum(axis=0)

joint = M1 / total
marginal_word = word_cnt / total
marginal_context = context_cnt / total

# Avoid dividing by zero, will not affect the result: a row or column whose
# marginal is zero holds no non-zero counts that could be scaled.
marginal_word[marginal_word == 0.] = 1.
marginal_context[marginal_context == 0.] = 1.

# Scale each row by 1/P(w) and each column by 1/P(c). The word reciprocals are
# reshaped into a column so they broadcast across rows (assumes the axis sums
# above are 1-D arrays).
# The explicit tocoo() guarantees the .row/.col attributes used below instead
# of relying on whichever sparse format the broadcasted multiply returns.
mi = (
    joint
    * np.expand_dims(np.reciprocal(marginal_word), axis=0).T
    * np.reciprocal(marginal_context)
).tocoo()

# Only the stored entries are transformed, so absent (zero) cells stay absent.
pmi_data = np.log2(mi.data)
ppmi_data = np.maximum(pmi_data, 0.)
M1Plus = scipy.sparse.csr_array((ppmi_data, (mi.row, mi.col)), shape=mi.shape)

In [6]:
# Densify both matrices; later cells feed M1Plus to PCA and index rows of both.
# The issparse guard makes this cell safe to re-run: once converted, the
# arrays are plain ndarrays and no longer offer a sparse conversion method.
if scipy.sparse.issparse(M1):
    M1 = M1.toarray()
if scipy.sparse.issparse(M1Plus):
    M1Plus = M1Plus.toarray()

In [7]:
# Construct latent semantic model M2 by applying PCA to M1+.
# A fixed random_state keeps the projections reproducible across runs: for a
# matrix of this size scikit-learn's automatic solver choice can be a
# randomized SVD, which otherwise yields slightly different components (and
# therefore different downstream correlations) on every execution.
PCA_RANDOM_STATE = 0

pca_10 = sklearn.decomposition.PCA(n_components=10, random_state=PCA_RANDOM_STATE)
pca_100 = sklearn.decomposition.PCA(n_components=100, random_state=PCA_RANDOM_STATE)
pca_300 = sklearn.decomposition.PCA(n_components=300, random_state=PCA_RANDOM_STATE)

# Each model is fitted on, and then used to project, the same PPMI matrix.
M2_10 = pca_10.fit_transform(M1Plus)
M2_100 = pca_100.fit_transform(M1Plus)
M2_300 = pca_300.fit_transform(M1Plus)

In [8]:
# Find all pairs of words in Table 1 of RG65 that are also in W.
# Record human-labeled similarities and calculate model-predicted similarities.
def cosine_similarity(vec_1, vec_2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec_1: First vector (array-like accepted by numpy).
        vec_2: Second vector, same length as `vec_1`.

    Returns:
        dot(vec_1, vec_2) / (|vec_1| * |vec_2|), or 0. when either vector has
        zero norm, for which the cosine is undefined.
    """
    len_a, len_b = np.linalg.norm(vec_1), np.linalg.norm(vec_2)

    # Guard against division by zero for all-zero vectors.
    if not (len_a != 0. and len_b != 0.):
        return 0.

    inner = np.dot(vec_1, vec_2)
    return inner / (len_a * len_b)


# One entry per model, in reporting order; the same mapping drives both the
# similarity computation and the printed labels.
models = {
    'M1': M1,
    'M1Plus': M1Plus,
    'M2_10': M2_10,
    'M2_100': M2_100,
    'M2_300': M2_300,
}

# x holds the human scores; ys[i] holds the similarities predicted by the
# i-th model for the same word pairs.
x = []
ys = [[] for _ in models]
for word_1, word_2, score in zip(df[0], df[1], df[2]):
    # Skip pairs with a word that has no row in the matrices.
    if word_1 not in vocab or word_2 not in vocab:
        continue

    x.append(score)
    ind_1, ind_2 = vocab[word_1], vocab[word_2]
    for y, matrix in zip(ys, models.values()):
        y.append(cosine_similarity(matrix[ind_1], matrix[ind_2]))

# Report how well each model's similarities track the human judgements.
for matrix_name, y in zip(models, ys):
    r = scipy.stats.pearsonr(x, y)
    print(f'Pearson Correlation between Human and {matrix_name}:\n{r}')

Pearson Correlation between Human and M1:
PearsonRResult(statistic=0.34275259484379167, pvalue=0.005190853243447476)
Pearson Correlation between Human and M1Plus:
PearsonRResult(statistic=0.2572576769372493, pvalue=0.03856545520393161)
Pearson Correlation between Human and M2_10:
PearsonRResult(statistic=0.20332415786934097, pvalue=0.10427700948731429)
Pearson Correlation between Human and M2_100:
PearsonRResult(statistic=0.322681818672728, pvalue=0.008751880744247407)
Pearson Correlation between Human and M2_300:
PearsonRResult(statistic=0.29889190785068204, pvalue=0.015582299285943483)
