In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import wordnet as wn
import fasttext.util
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau
from gensim.models import KeyedVectors   # for loading .vec format files

import io
import gc


In [2]:
# SimLex-999 gold standard: a tab-separated file with one word pair per row
simlex_file = 'SimLex-999/SimLex-999.txt'
df = pd.read_csv(simlex_file, delimiter='\t')
df

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75


## 1. Word similarities based on WordNet's *path_similarity*


In [3]:
path_sims = []
missing_idxs = []  # row indices of pairs where a word has no WordNet synset for its POS

# SimLex-999 POS tag -> WordNet POS constant; any other tag is treated as an adjective
simlex_to_wn_pos = {'N': wn.NOUN, 'V': wn.VERB}

for i, word1, word2, pos in zip(df.index, df['word1'], df['word2'], df['POS']):

    wn_pos = simlex_to_wn_pos.get(pos, wn.ADJ)

    # all synsets of each word, restricted to the pair's part of speech
    word1_pos_synsets = wn.synsets(word1, pos=wn_pos)
    word2_pos_synsets = wn.synsets(word2, pos=wn_pos)

    # a word unknown to WordNet (for this POS): remember the pair, store a placeholder
    if not word1_pos_synsets or not word2_pos_synsets:
        missing_idxs.append(i)
        path_sims.append(0)  # arbitrary placeholder; the affected rows are listed in missing_idxs
        continue

    # path similarity for every combination of synsets of the two words;
    # path_similarity is documented to return None when no connecting path exists,
    # and max() cannot compare None with a float, so those results are dropped
    all_comb_path_sims = []
    for synset_w1 in word1_pos_synsets:
        for synset_w2 in word2_pos_synsets:
            sim = wn.path_similarity(synset_w1, synset_w2)
            if sim is not None:
                all_comb_path_sims.append(sim)

    # the pair's score is the best match over all sense combinations;
    # default=0 reuses the placeholder if no combination is connected at all
    path_sims.append(max(all_comb_path_sims, default=0))


print('Number of pairs with missing words:', len(missing_idxs))

df['WordNet path similarity'] = path_sims
df


Number of pairs with missing words: 0


Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),WordNet path similarity
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.333333
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67,0.333333
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,1.000000
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.333333
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.333333
...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99,0.333333
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44,0.200000
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97,0.250000
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75,0.333333


## 2. Word similarities based on FastText's word vectors

In [4]:
# Fetch the English fastText model; if_exists='ignore' is presumably there to skip
# the download when the file is already on disk (confirm against the fasttext.util docs).
fasttext.util.download_model('en', if_exists='ignore')
# Load the binary model file into memory; it is deleted again a few cells below.
ft = fasttext.load_model('cc.en.300.bin')




In [5]:
cos_sims = []
missing_idxs = []  # row indices of pairs with at least one word missing from the model vocabulary

all_words = ft.get_words()
print('Total number of words in the vocabulary:', len(all_words))

# get_words() returns a list, so an `in` test on it scans the vocabulary linearly;
# build a set once so each membership check inside the loop is constant-time
vocabulary = set(all_words)

for i, word1, word2 in zip(df.index, df['word1'], df['word2']):

    if word1 not in vocabulary or word2 not in vocabulary:
        missing_idxs.append(i)  # remembering the idx of the pair
        cos_sims.append(0)      # arbitrary placeholder; the affected rows are listed in missing_idxs
        continue

    word1_vec = ft.get_word_vector(word1)
    word2_vec = ft.get_word_vector(word2)
    # scipy's cosine() is a distance (1 - cos_sim), so subtract it from 1 to get the similarity
    cos_sims.append(1 - cosine(word1_vec, word2_vec))

print('Number of pairs with missing words:', len(missing_idxs))

df['FastText vectors similarity (cc.en.300)'] = cos_sims
df

Total number of words in the vocabulary: 2000000
Number of pairs with missing words: 0


Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),WordNet path similarity,FastText vectors similarity (cc.en.300)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.333333,0.441964
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67,0.333333,0.704955
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,1.000000,0.631380
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.333333,0.545871
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.333333,0.486345
...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99,0.333333,0.229357
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44,0.200000,0.350965
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97,0.250000,0.377495
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75,0.333333,0.297315


In [6]:
# Clear memory, as the model holds ~7 GB of RAM: drop the `ft` name,
# then explicitly run the garbage collector.
del ft
# Last expression of the cell, so its return value is what the cell displays.
gc.collect()


0

## 3. Word similarities based on WordNet's *wup_similarity*


In [7]:
wup_sims = []
missing_idxs = []  # row indices of pairs where a word has no WordNet synset for its POS

# SimLex-999 POS tag -> WordNet POS constant; any other tag is treated as an adjective
simlex_to_wn_pos = {'N': wn.NOUN, 'V': wn.VERB}

for i, word1, word2, pos in zip(df.index, df['word1'], df['word2'], df['POS']):

    wn_pos = simlex_to_wn_pos.get(pos, wn.ADJ)

    # all synsets of each word, restricted to the pair's part of speech
    word1_pos_synsets = wn.synsets(word1, pos=wn_pos)
    word2_pos_synsets = wn.synsets(word2, pos=wn_pos)

    # a word unknown to WordNet (for this POS): remember the pair, store a placeholder
    if not word1_pos_synsets or not word2_pos_synsets:
        missing_idxs.append(i)
        wup_sims.append(0)  # arbitrary placeholder; the affected rows are listed in missing_idxs
        continue

    # Wu-Palmer similarity for every combination of synsets of the two words;
    # wup_similarity is documented to return None when no common ancestor/path exists,
    # and max() cannot compare None with a float, so those results are dropped
    all_comb_wup_sims = []
    for synset_w1 in word1_pos_synsets:
        for synset_w2 in word2_pos_synsets:
            sim = wn.wup_similarity(synset_w1, synset_w2)
            if sim is not None:
                all_comb_wup_sims.append(sim)

    # the pair's score is the best match over all sense combinations;
    # default=0 reuses the placeholder if no combination is connected at all
    wup_sims.append(max(all_comb_wup_sims, default=0))


print('Number of pairs with missing words:', len(missing_idxs))

df['WordNet WUP similarity'] = wup_sims
df


Number of pairs with missing words: 0


Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),WordNet path similarity,FastText vectors similarity (cc.en.300),WordNet WUP similarity
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.333333,0.441964,0.500000
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67,0.333333,0.704955,0.500000
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,1.000000,0.631380,1.000000
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.333333,0.545871,0.500000
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.333333,0.486345,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99,0.333333,0.229357,0.500000
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44,0.200000,0.350965,0.333333
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97,0.250000,0.377495,0.400000
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75,0.333333,0.297315,0.500000


## 4. Word similarities based on FastText's word vectors: other pre-trained models

In [8]:
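# Loader from the FastText website, kept commented out for reference only:
# it reads every vector into a plain Python dict, which blows up memory use
# and crashed the program. KeyedVectors.load_word2vec_format is used instead.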

# def load_vectors(fname):
#     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
#     n, d = map(int, fin.readline().split())
#     data = {}
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         data[tokens[0]] = map(float, tokens[1:])
#     return data


In [9]:
# File name of the text-format (.vec) vectors; `model` is reused in the next cell
# to build the name of the result column.
model = 'crawl-300d-2M.vec'
# Read the whole file into a gensim KeyedVectors object (word -> vector lookup).
ft_vecs = KeyedVectors.load_word2vec_format(model)

In [10]:
cos_sims = []
missing_idxs = []  # row indices of pairs with at least one word absent from the vectors

print('Total number of words in the vocabulary:', len(ft_vecs))

for i, word1, word2 in zip(df.index, df['word1'], df['word2']):

    # A pair with a missing word still needs an entry in cos_sims: skipping it
    # would leave the list shorter than df and break the column assignment below.
    if word1 not in ft_vecs.key_to_index or word2 not in ft_vecs.key_to_index:
        missing_idxs.append(i)
        cos_sims.append(0)  # arbitrary placeholder; the affected rows are listed in missing_idxs
        continue

    # scipy's cosine() is a distance (1 - cos_sim), so subtract it from 1 to get the similarity
    cos_sims.append(1 - cosine(ft_vecs[word1], ft_vecs[word2]))


print('Number of pairs with missing words:', len(missing_idxs))

# column name is derived from the vectors' file name without the ".vec" extension
df[f'FastText vectors similarity ({model.replace(".vec","")})'] = cos_sims
df


Total number of words in the vocabulary: 1999995
Number of pairs with missing words: 0


Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),WordNet path similarity,FastText vectors similarity (cc.en.300),WordNet WUP similarity,FastText vectors similarity (crawl-300d-2M)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.333333,0.441964,0.500000,0.504730
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67,0.333333,0.704955,0.500000,0.780194
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,1.000000,0.631380,1.000000,0.771711
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.333333,0.545871,0.500000,0.554318
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.333333,0.486345,0.500000,0.589305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99,0.333333,0.229357,0.500000,0.263337
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44,0.200000,0.350965,0.333333,0.359698
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97,0.250000,0.377495,0.400000,0.375723
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75,0.333333,0.297315,0.500000,0.321654


In [11]:
# Drop the loaded vectors and run the garbage collector before the next model is loaded.
del ft_vecs
# Last expression of the cell, so its return value is what the cell displays.
gc.collect()


0

## 6. Calculating Kendall's tau between the gold standard and obtained scores

In [12]:
# Kendall's tau between the gold-standard ratings and the WordNet path-similarity scores
path_tau = kendalltau(df['SimLex999'], df['WordNet path similarity']).statistic
print(f"SimLex999 & WordNet path similarity: {path_tau:.4}")

SimLex999 & WordNet path similarity: 0.3534


In [13]:
# Kendall's tau between the gold-standard ratings and the cc.en.300 cosine similarities
cc_en_tau = kendalltau(df['SimLex999'], df['FastText vectors similarity (cc.en.300)']).statistic
print(f"SimLex999 & FastText Word Vectors Similarity (cc.en.300): {cc_en_tau:.4}")

SimLex999 & FastText Word Vectors Similarity (cc.en.300): 0.3301


In [14]:
# Kendall's tau between the gold-standard ratings and the WordNet Wu-Palmer scores
wup_tau = kendalltau(df['SimLex999'], df['WordNet WUP similarity']).statistic
print(f"SimLex999 & WordNet WUP similarity: {wup_tau:.4}")

SimLex999 & WordNet WUP similarity: 0.3211


In [15]:
# Kendall's tau between the gold-standard ratings and the crawl-300d-2M cosine similarities
crawl_tau = kendalltau(df['SimLex999'], df['FastText vectors similarity (crawl-300d-2M)']).statistic
print(f"SimLex999 & FastText Word Vectors Similarity (crawl-300d-2M): {crawl_tau:.4}")

SimLex999 & FastText Word Vectors Similarity (crawl-300d-2M): 0.3621


## 7. Additional experimentation

In [16]:
# File name of the text-format (.vec) vectors; `model` is reused in the next cell
# to build the name of the result column.
model = 'crawl-300d-2M-subword.vec'
# Read the whole file into a gensim KeyedVectors object (word -> vector lookup).
ft_vecs = KeyedVectors.load_word2vec_format(model)

In [17]:
cos_sims = []
missing_idxs = []  # row indices of pairs with at least one word absent from the vectors

print('Total number of words in the vocabulary:', len(ft_vecs))

for i, word1, word2 in zip(df.index, df['word1'], df['word2']):

    # A pair with a missing word still needs an entry in cos_sims: skipping it
    # would leave the list shorter than df and break the column assignment below.
    if word1 not in ft_vecs.key_to_index or word2 not in ft_vecs.key_to_index:
        missing_idxs.append(i)
        cos_sims.append(0)  # arbitrary placeholder; the affected rows are listed in missing_idxs
        continue

    # scipy's cosine() is a distance (1 - cos_sim), so subtract it from 1 to get the similarity
    cos_sims.append(1 - cosine(ft_vecs[word1], ft_vecs[word2]))


print('Number of pairs with missing words:', len(missing_idxs))

# column name is derived from the vectors' file name without the ".vec" extension
df[f'FastText vectors similarity ({model.replace(".vec","")})'] = cos_sims
df


Total number of words in the vocabulary: 2000000
Number of pairs with missing words: 0


Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),WordNet path similarity,FastText vectors similarity (cc.en.300),WordNet WUP similarity,FastText vectors similarity (crawl-300d-2M),FastText vectors similarity (crawl-300d-2M-subword)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.333333,0.441964,0.500000,0.504730,0.654177
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67,0.333333,0.704955,0.500000,0.780194,0.795616
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,1.000000,0.631380,1.000000,0.771711,0.836774
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.333333,0.545871,0.500000,0.554318,0.648004
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.333333,0.486345,0.500000,0.589305,0.703270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99,0.333333,0.229357,0.500000,0.263337,0.422737
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44,0.200000,0.350965,0.333333,0.359698,0.470447
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97,0.250000,0.377495,0.400000,0.375723,0.491910
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75,0.333333,0.297315,0.500000,0.321654,0.481629


In [18]:
# Kendall's tau between the gold-standard ratings and the crawl-300d-2M-subword cosine similarities
subword_tau = kendalltau(df['SimLex999'], df['FastText vectors similarity (crawl-300d-2M-subword)']).statistic
print(f"SimLex999 & FastText Word Vectors Similarity (crawl-300d-2M-subword): {subword_tau:.4}")

SimLex999 & FastText Word Vectors Similarity (crawl-300d-2M-subword): 0.338


In [19]:
# Drop the loaded vectors and run the garbage collector before the next model is loaded.
del ft_vecs
# Last expression of the cell, so its return value is what the cell displays.
gc.collect()


0

In [20]:
# File name of the text-format (.vec) vectors; `model` is reused in the next cell
# to build the name of the result column.
model = 'wiki-news-300d-1M.vec'
# Read the whole file into a gensim KeyedVectors object (word -> vector lookup).
ft_vecs = KeyedVectors.load_word2vec_format(model)

In [21]:
cos_sims = []
missing_idxs = []  # row indices of pairs with at least one word absent from the vectors

print('Total number of words in the vocabulary:', len(ft_vecs))

for i, word1, word2 in zip(df.index, df['word1'], df['word2']):

    # A pair with a missing word still needs an entry in cos_sims: skipping it
    # would leave the list shorter than df and break the column assignment below.
    if word1 not in ft_vecs.key_to_index or word2 not in ft_vecs.key_to_index:
        missing_idxs.append(i)
        cos_sims.append(0)  # arbitrary placeholder; the affected rows are listed in missing_idxs
        continue

    # scipy's cosine() is a distance (1 - cos_sim), so subtract it from 1 to get the similarity
    cos_sims.append(1 - cosine(ft_vecs[word1], ft_vecs[word2]))


print('Number of pairs with missing words:', len(missing_idxs))

# column name is derived from the vectors' file name without the ".vec" extension
df[f'FastText vectors similarity ({model.replace(".vec","")})'] = cos_sims
df


Total number of words in the vocabulary: 999994
Number of pairs with missing words: 0


Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),WordNet path similarity,FastText vectors similarity (cc.en.300),WordNet WUP similarity,FastText vectors similarity (crawl-300d-2M),FastText vectors similarity (crawl-300d-2M-subword),FastText vectors similarity (wiki-news-300d-1M)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.333333,0.441964,0.500000,0.504730,0.654177,0.653033
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67,0.333333,0.704955,0.500000,0.780194,0.795616,0.768561
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,1.000000,0.631380,1.000000,0.771711,0.836774,0.701083
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.333333,0.545871,0.500000,0.554318,0.648004,0.663085
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.333333,0.486345,0.500000,0.589305,0.703270,0.645981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99,0.333333,0.229357,0.500000,0.263337,0.422737,0.542489
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44,0.200000,0.350965,0.333333,0.359698,0.470447,0.587319
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97,0.250000,0.377495,0.400000,0.375723,0.491910,0.587173
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75,0.333333,0.297315,0.500000,0.321654,0.481629,0.463984


In [22]:
# Kendall's tau between the gold-standard ratings and the wiki-news-300d-1M cosine similarities
wiki_news_tau = kendalltau(df['SimLex999'], df['FastText vectors similarity (wiki-news-300d-1M)']).statistic
print(f"SimLex999 & FastText Word Vectors Similarity (wiki-news-300d-1M): {wiki_news_tau:.4}")

SimLex999 & FastText Word Vectors Similarity (wiki-news-300d-1M): 0.3223
