In [1]:
import nltk
import sklearn.decomposition
import scipy.stats
import scipy.sparse
import numpy as np
import pandas as pd

In [2]:
# nltk.download('brown')
all_words = list(map(str.lower, nltk.corpus.brown.words()))
eng_words = list(filter(str.isalpha, all_words))

In [9]:
# Extract the 5000 most frequent words W.
unigrams = nltk.ngrams(eng_words, 1)
unigrams_freq = nltk.FreqDist(unigrams)
W = list(map(lambda x: x[0][0], unigrams_freq.most_common(5000)))
vocab = {word: idx for idx, word in enumerate(W)}

# Report the 5 most and least common words in W. 
top_5, bottom_5 = W[:5], W[-5:]
print(top_5, bottom_5)
print(W[-30:])

# Update W by words in Table 1 of RG65.
df = pd.read_csv('./synonymy.csv')
for word in pd.concat([df.word_1, df.word_2]):
    if word not in vocab and (word,) in unigrams_freq:
        W.append(word)
        vocab[word] = len(vocab)

['the', 'of', 'and', 'to', 'a'] ['vertex', 'rourke', 'killpath', 'haney', 'letch']
['essay', 'distinguish', 'patents', 'therapist', 'damned', 'murderer', 'plantation', 'helion', 'rousseau', 'smelled', 'reactivity', 'tetrachloride', 'sera', 'nonspecific', 'vector', 'vertex', 'rourke', 'killpath', 'haney', 'letch']


In [4]:
# Construct word-context vector model M1.
data, row_ind, col_ind = [], [], []

bigrams = nltk.ngrams(all_words, 2)
bigrams_freq = nltk.FreqDist(bigrams)
for bigram, freq in bigrams_freq.items():
    if bigram[0] in vocab and bigram[1] in vocab:
        data.append(freq)
        row_ind.append(vocab[bigram[0]])
        col_ind.append(vocab[bigram[1]])

M1 = scipy.sparse.csr_array((data, (row_ind, col_ind)), shape=(len(W), len(W)))

In [5]:
# Compute PPMI on M1 as M1+.
tot_sum = M1.sum()
word_sum = M1.sum(axis=1)
context_sum = M1.sum(axis=0)

joint = M1 / tot_sum
word_marginal = word_sum / tot_sum
context_marginal = context_sum / tot_sum

mi = joint \
     * np.expand_dims(np.reciprocal(word_marginal), axis=0).T \
     * np.reciprocal(context_marginal)
pmi_data = np.log2(mi.data)
ppmi_data = np.maximum(pmi_data, 0.)
M1Plus = scipy.sparse.csr_array((ppmi_data, (mi.row, mi.col)), shape=mi.shape).todense()

  * np.expand_dims(np.reciprocal(word_marginal), axis=0).T \
  * np.reciprocal(context_marginal)


In [6]:
# Construct latent semantic model M2 by applying PCA to M1+.
pca_10 = sklearn.decomposition.PCA(n_components=10)
pca_100 = sklearn.decomposition.PCA(n_components=100)
pca_300 = sklearn.decomposition.PCA(n_components=300)

M2_10 = pca_10.fit_transform(M1Plus)
M2_100 = pca_100.fit_transform(M1Plus)
M2_300 = pca_300.fit_transform(M1Plus)

In [7]:
# Find all pairs of words in Table 1 of RG65 that are also in W.
# Record human-labeled similarities and calculate model-predicted similarities.
def cosine_similarity(vec_1, vec_2):
    norm_1 = np.linalg.norm(vec_1)
    norm_2 = np.linalg.norm(vec_2)

    if norm_1 == 0. or norm_2 == 0.:
        return 0.

    return np.dot(vec_1, vec_2) / (norm_1*norm_2)


x, ys = [], [[], [], [], [], []]
for _, (word_1, word_2, score) in df.iterrows():
    if word_1 in vocab and word_2 in vocab:
        x.append(score)

        ind_1, ind_2 = vocab[word_1], vocab[word_2]
        for idx, matrix in enumerate([M1.todense(), M1Plus, M2_10, M2_100, M2_300]):
        # for idx, matrix in enumerate([M1.todense()]):
            ys[idx].append(cosine_similarity(matrix[ind_1], matrix[ind_2]))

for y, matrix_name in zip(ys, ['M1', 'M1Plus', 'M2_10', 'M2_100', 'M2_300']):
# for y, matrix_name in zip(ys, ['M1']):
    r = scipy.stats.pearsonr(x, y)
    print(f'Pearson Correlation between Human and {matrix_name}:\n{r}')

Pearson Correlation between Human and M1:
PearsonRResult(statistic=0.32055637738466136, pvalue=0.009810288765306442)
Pearson Correlation between Human and M1Plus:
PearsonRResult(statistic=0.2031487194875153, pvalue=0.10739872408309742)
Pearson Correlation between Human and M2_10:
PearsonRResult(statistic=0.1904962659203761, pvalue=0.13161122162653538)
Pearson Correlation between Human and M2_100:
PearsonRResult(statistic=0.3734250770679275, pvalue=0.0023713847821895395)
Pearson Correlation between Human and M2_300:
PearsonRResult(statistic=0.35605460741686434, pvalue=0.003882223705338614)
