In [1]:
import nltk
from nltk.corpus import brown, stopwords
import string

from collections import Counter
from nltk.util import ngrams
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr


In [2]:
punctuation = [p for p in string.punctuation]


stop = stopwords.words('english')

In [3]:
TABLE1 = ['brother', 'asylum', 'cord', 'signature', 'gem', 'bird', 'jewel', 'woodland', 'tumbler', 'graveyard', 'fruit', 'mound', 'wizard', 'pillow', 'rooster', 'automobile', 'magician', 'cushion', 'shore', 'stove', 'slave', 'cock', 'furnace', 'madhouse', 'car', 'midday', 'boy', 'cemetery', 'hill', 'oracle', 'coast', 'crane', 'autograph', 'food', 'voyage', 'serf', 'string', 'implement', 'forest', 'grin', 'sage', 'lad', 'glass', 'noon', 'monk', 'smile', 'tool']

PAIRS = [('cord', 'smile', 0.02), ('rooster, voyage', 0.04), ('noon', 'string', 0.04), ('fruit', 'furnace' ,0.05) , ('autograph', 'shore', 0.06),
         ('automobile', 'wizard', 0.11), ('mound', 'stove', 0.14), ('grin', 'implement', 0.18), ('asylum', 'fruit', 0.19), ('asylum', 'monk', 0.39),
         ('graveyard', 'madhouse', 0.42), ('glass','magician', 0.44), ('boy', 'rooster', 0.44), ('cushion', 'jewel' , 0.45), ('monk', 'slave', 0.57),
         ('asylum', 'cemetery', 0.79), ('coast', 'forest', 0.85), ('grin', 'land', 0.88), ('shore', 'woodland', 0.9), ('monk', 'oracle', 0.91), ('boy', 'sage', .96),
         ('automobile', 'cushion', 0.97), ('mound', 'shore', 0.97), ('lad', 'wizard', 0.99), ('forest', 'graveyard', 1), ('food', 'rooster', 1.09), ('cemetery', 'woodland', 1.24),
         ('coast', 'hill', 1.26), ('furnace', 'implement',1.37), ('crane', 'rooster', 1.41), ('hill', 'woodland', 1.48), ('car', 'journey', 1.55), ('cemetery', 'mound', 1.69),
         ('glass', 'jewel', 1.78), ('magician', 'oracle', 1.82), ('crane', 'implement', 2.37), ('brother', 'lad', 2.41), ('sage', 'wizard', 2.46),
         ('oracle', 'sage', 2.61), ('bird', 'crane', 2.63), ('food', 'fruit', 2.69), ('brother', 'monk', 2.74), ('asylum','madhouse', 3.04), ('furnace', 'stove', 3.11), ('magician', 'wizard', 3.21),
         ('hill', 'mound', 3.29), ('cord', 'string', 3.41), ('glass', 'tumbler', 3.45), ('grin', 'smile', 3.46), ('serf', 'slave', 3.46), ('journey', 'voyage', 3.58), ('autograph', 'signature', 3.59),
         ('coast', 'shore', 3.6), ('forest', 'woodland', 3.65), ('implement', 'tool',3.66), ('cock', 'rooster', 3.68), ('boy', 'lad', 3.82), ('cushion','pillow', 3.84),
         ('cemetery', 'graveyard', 3.88), ('automobile', 'car', 3.92), ('midday', 'noon', 3.94), ('gem', 'jewel', 3.94)]



In [6]:
def in_brown_corpus(corpus, w):
    return w in corpus

def get_brown_frequencies(n = 5000):
    freq_dict = nltk.FreqDist(w for w in brown.words())
    freq_words = freq_dict.most_common(n)
    w = [item[0] for item in freq_words if item[0] not in punctuation and item[0] not in stop]
    least_common_words = w[-5:]
    print('Brown corpus least common words', least_common_words)
    return w, least_common_words

def get_bigrams(W):
    bigrams = ngrams(brown.words(), 2)
    bigrams_freq = Counter(bigrams)
    W_bigrams = np.zeros(shape=(len(W), len(W)))
    for i ,w1 in enumerate(W):
        for j, w2 in enumerate(W):
            W_bigrams[i, j] = bigrams_freq[(w1, w2)] + 1 #1-smoothing

    return W_bigrams



def get_lsa(M):
    pca = PCA(n_components=10)
    M2_10 = pca.fit_transform(M)

    pca = PCA(n_components=100)
    M2_100 = pca.fit_transform(M)


    pca = PCA(n_components=300)
    M2_300 = pca.fit_transform(M)

    return M2_10, M2_100, M2_300


def get_ppmi(M1):


    #ppmi = max (0, log (p (y | x ) / p(y))
    # p(y | x)
    row_sum = M1.sum(axis= 0)
    
    p_y_x = M1 / row_sum

    col_sum = M1.sum(axis = 1)
    p_y = col_sum / col_sum.sum()


    ppmi = np.maximum(0, np.log(p_y_x / p_y))
    return ppmi


def get_similarity(M, W, pairs):
    similarities =[]
    for item in pairs:
        index1 = W.index(item[0])
        index2 = W.index(item[1])
        similarity = cosine_similarity([M[index1]], [M[index2]])[0][0]
        similarities.append((item[0], item[1], similarity))

    return similarities


def get_correlation(pairs1, pairs2):
    x = []
    y = []
    for item1, item2 in zip(pairs1, pairs2):
        x.append(item1[2])
        y.append(item2[2])

    return pearsonr(x, y)








In [7]:
#Step 2
W, least_common_words = get_brown_frequencies()

Brown corpus least common words ['expanded', 'emphasize', 'Manhattan', 'temporarily', 'puts']


In [8]:
new_words = [w for w in TABLE1 if not in_brown_corpus(W, w)]
W = W + new_words
n = len(W)
n

4880

In [9]:
#step 3
#getting word-context vector
M1 = get_bigrams(W)

In [10]:
#step 4
#getting ppmi
M1_plus = get_ppmi(M1)

In [11]:
#step 5
M2_10 , M2_100, M2_300 = get_lsa(M1_plus)

In [12]:
#step 6
#pairs
P = [item for item in PAIRS if item[0] in W and item[1] in W]

P

[('cord', 'smile', 0.02),
 ('noon', 'string', 0.04),
 ('fruit', 'furnace', 0.05),
 ('autograph', 'shore', 0.06),
 ('automobile', 'wizard', 0.11),
 ('mound', 'stove', 0.14),
 ('grin', 'implement', 0.18),
 ('asylum', 'fruit', 0.19),
 ('asylum', 'monk', 0.39),
 ('graveyard', 'madhouse', 0.42),
 ('glass', 'magician', 0.44),
 ('boy', 'rooster', 0.44),
 ('cushion', 'jewel', 0.45),
 ('monk', 'slave', 0.57),
 ('asylum', 'cemetery', 0.79),
 ('coast', 'forest', 0.85),
 ('grin', 'land', 0.88),
 ('shore', 'woodland', 0.9),
 ('monk', 'oracle', 0.91),
 ('boy', 'sage', 0.96),
 ('automobile', 'cushion', 0.97),
 ('mound', 'shore', 0.97),
 ('lad', 'wizard', 0.99),
 ('forest', 'graveyard', 1),
 ('food', 'rooster', 1.09),
 ('cemetery', 'woodland', 1.24),
 ('coast', 'hill', 1.26),
 ('furnace', 'implement', 1.37),
 ('crane', 'rooster', 1.41),
 ('hill', 'woodland', 1.48),
 ('car', 'journey', 1.55),
 ('cemetery', 'mound', 1.69),
 ('glass', 'jewel', 1.78),
 ('magician', 'oracle', 1.82),
 ('crane', 'implement',

In [13]:
#Step 7
#getting similarities
S_M1 = get_similarity(M1, W, P)
S_M1_plus = get_similarity(M1_plus, W, P)
S_M2_10 = get_similarity(M2_10, W, P)
S_M2_100 = get_similarity(M2_100, W, P)
S_M2_300 = get_similarity(M2_300, W, P)

In [14]:
S_M1

[('cord', 'smile', 0.9989815513643365),
 ('noon', 'string', 0.9989781528882894),
 ('fruit', 'furnace', 0.9995908556328321),
 ('autograph', 'shore', 0.9965543623117976),
 ('automobile', 'wizard', 0.9952859562765184),
 ('mound', 'stove', 0.9997952078640164),
 ('grin', 'implement', 0.9998976196739395),
 ('asylum', 'fruit', 0.9996933303814922),
 ('asylum', 'monk', 0.99938807094304),
 ('graveyard', 'madhouse', 0.9999999999999944),
 ('glass', 'magician', 0.9966781286469948),
 ('boy', 'rooster', 0.9783204794030969),
 ('cushion', 'jewel', 0.9997953965546035),
 ('monk', 'slave', 0.9977599826847963),
 ('asylum', 'cemetery', 0.9998976196739395),
 ('coast', 'forest', 0.9946503878579214),
 ('grin', 'land', 0.9892898265880599),
 ('shore', 'woodland', 0.9965543623117976),
 ('monk', 'oracle', 0.99938807094304),
 ('boy', 'sage', 0.9782174844972484),
 ('automobile', 'cushion', 0.9950800742908119),
 ('mound', 'shore', 0.9964518339763868),
 ('lad', 'wizard', 0.9999999999999944),
 ('forest', 'graveyard', 0

In [15]:
S_M1_plus

[('cord', 'smile', 0.016962436970403748),
 ('noon', 'string', 0.009904477174215293),
 ('fruit', 'furnace', 0.027093028399408206),
 ('autograph', 'shore', 0.07264154208170073),
 ('automobile', 'wizard', 0.05589987777998079),
 ('mound', 'stove', 0.03927941579164186),
 ('grin', 'implement', 0.19860535089717707),
 ('asylum', 'fruit', 0.1386949305245328),
 ('asylum', 'monk', 0.08978740719866643),
 ('graveyard', 'madhouse', 0.9999999999999991),
 ('glass', 'magician', 0.07741801887413946),
 ('boy', 'rooster', 0.05248623173379396),
 ('cushion', 'jewel', 0.14718775653722987),
 ('monk', 'slave', 0.006058687103052774),
 ('asylum', 'cemetery', 0.9999999999999991),
 ('coast', 'forest', 0.06756857202257152),
 ('grin', 'land', 0.057119823283641485),
 ('shore', 'woodland', 0.07264154208170073),
 ('monk', 'oracle', 0.08978740719866643),
 ('boy', 'sage', 0.05248623173379396),
 ('automobile', 'cushion', 0.008021180646739637),
 ('mound', 'shore', 0.014024452265850182),
 ('lad', 'wizard', 0.999999999999999

In [16]:
S_M2_10

[('cord', 'smile', 0.45577424194097405),
 ('noon', 'string', 0.9733577895598542),
 ('fruit', 'furnace', 0.9535371701033949),
 ('autograph', 'shore', 0.9793053494529033),
 ('automobile', 'wizard', 0.4865365131225633),
 ('mound', 'stove', 0.9844988795610423),
 ('grin', 'implement', 0.9897515966255357),
 ('asylum', 'fruit', 0.9599557299789397),
 ('asylum', 'monk', 0.45142757160929314),
 ('graveyard', 'madhouse', 0.9999999999999998),
 ('glass', 'magician', 0.6994217986865672),
 ('boy', 'rooster', -0.6001931310206239),
 ('cushion', 'jewel', 0.9961445348235589),
 ('monk', 'slave', 0.09031525490026938),
 ('asylum', 'cemetery', 0.9999999999999998),
 ('coast', 'forest', 0.43440256363546514),
 ('grin', 'land', -0.728992329321353),
 ('shore', 'woodland', 0.9793053494529033),
 ('monk', 'oracle', 0.45142757160929314),
 ('boy', 'sage', -0.6001931310206239),
 ('automobile', 'cushion', 0.5282165200950413),
 ('mound', 'shore', 0.979240273781126),
 ('lad', 'wizard', 0.9999999999999998),
 ('forest', 'gra

In [17]:
S_M2_100

[('cord', 'smile', 0.10786152466001751),
 ('noon', 'string', 0.6650041557044288),
 ('fruit', 'furnace', 0.6856452340937761),
 ('autograph', 'shore', 0.7437825476785465),
 ('automobile', 'wizard', 0.16766622726909794),
 ('mound', 'stove', 0.8759670697849667),
 ('grin', 'implement', 0.7525038929341152),
 ('asylum', 'fruit', 0.7588893023225667),
 ('asylum', 'monk', 0.22007116922676043),
 ('graveyard', 'madhouse', 0.9999999999999998),
 ('glass', 'magician', 0.12245618035674846),
 ('boy', 'rooster', -0.3623230619272046),
 ('cushion', 'jewel', 0.8810494816327551),
 ('monk', 'slave', -0.00019826021951712671),
 ('asylum', 'cemetery', 0.9999999999999997),
 ('coast', 'forest', 0.30201948259757444),
 ('grin', 'land', -0.30036229483234006),
 ('shore', 'woodland', 0.7437825476785466),
 ('monk', 'oracle', 0.22007116922676054),
 ('boy', 'sage', -0.3623230619272047),
 ('automobile', 'cushion', 0.11908230241322532),
 ('mound', 'shore', 0.6761068950395086),
 ('lad', 'wizard', 0.9999999999999998),
 ('for

In [18]:
S_M2_300

[('cord', 'smile', 0.08734374612346854),
 ('noon', 'string', 0.3592197486488922),
 ('fruit', 'furnace', 0.2324517770323188),
 ('autograph', 'shore', 0.5829082354337567),
 ('automobile', 'wizard', 0.08124950081287755),
 ('mound', 'stove', 0.7232669363384278),
 ('grin', 'implement', 0.5427133055473967),
 ('asylum', 'fruit', 0.5355531446499311),
 ('asylum', 'monk', 0.1643767217469219),
 ('graveyard', 'madhouse', 1.0),
 ('glass', 'magician', 0.0699876896565751),
 ('boy', 'rooster', -0.27896672572639813),
 ('cushion', 'jewel', 0.6297586494936579),
 ('monk', 'slave', 0.01760629559607964),
 ('asylum', 'cemetery', 1.0),
 ('coast', 'forest', 0.19597057323106845),
 ('grin', 'land', -0.19014423158628785),
 ('shore', 'woodland', 0.5829082354337565),
 ('monk', 'oracle', 0.16437672174692256),
 ('boy', 'sage', -0.2789667257263977),
 ('automobile', 'cushion', 0.10698017727116439),
 ('mound', 'shore', 0.4353119369247698),
 ('lad', 'wizard', 0.9999999999999999),
 ('forest', 'graveyard', 0.02173682337740

In [19]:
print('M1',get_correlation(S_M1, P))
print('M1+', get_correlation(S_M1_plus, P))
print('M2_10', get_correlation(S_M2_10, P))
print('M2_100', get_correlation(S_M2_100, P))
print('M2_300', get_correlation(S_M2_300, P))

M1 (0.022130125658850898, 0.8655710867131343)
M1+ (0.14036557486677226, 0.2806042889617433)
M2_10 (0.0519921893384193, 0.6906739968711639)
M2_100 (0.0605958967676733, 0.6427159066274443)
M2_300 (0.09254350730319316, 0.4781014633640243)
