# CSI 5386 - Assignment 1
## Group 16
## Part B - Evaluation of Word Embeddings

In [1]:
from web.datasets.similarity import fetch_MTurk, fetch_MEN, fetch_WS353, fetch_RG65, fetch_RW, fetch_SimLex999, fetch_TR9856
from web.embeddings import fetch_SG_GoogleNews, fetch_HDC, fetch_PDC, fetch_HPCA, fetch_LexVec, fetch_conceptnet_numberbatch
from web.evaluate import evaluate_similarity

In [3]:
import pandas as pd
import logging
from six import iteritems

In [5]:
import warnings
# NOTE(review): with no category/message/module filter, this suppresses every
# warning for the rest of the session, not just a specific noisy one.
warnings.filterwarnings("ignore")

In [6]:
# Root logger setup: DEBUG verbosity, time-of-day stamp (12-hour clock, no date)
# followed by the level name and the message.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S'
)

In [7]:
# One score list per benchmark, plus "Average" for the cross-benchmark mean.
# `similarity_task` appends one entry to each list per embedding it evaluates.
results = {
    label: []
    for label in (
        "MTurk",
        "MEN",
        "WS353",
        "Rubenstein and Goodenough",
        "Rare Words",
        "SIMLEX999",
        "TR9856",
        "Average",
    )
}

# Benchmark name -> fetched dataset. Each fetcher is called exactly once,
# in the order listed here.
similarity_datasets = {
    label: fetch()
    for label, fetch in (
        ("MTurk", fetch_MTurk),
        ("MEN", fetch_MEN),
        ("WS353", fetch_WS353),
        ("Rubenstein and Goodenough", fetch_RG65),
        ("Rare Words", fetch_RW),
        ("SIMLEX999", fetch_SimLex999),
        ("TR9856", fetch_TR9856),
    )
}

def similarity_task(embedding, datasets):
    """Evaluate one embedding on every similarity dataset and record the scores.

    For each ``name -> data`` entry of ``datasets`` a sample pair is printed,
    then ``evaluate_similarity(embedding, data.X, data.y)`` is computed. Each
    score is appended to the module-level ``results[name]`` and the unweighted
    mean over all datasets is appended to ``results['Average']``.

    Every score is computed before ``results`` is modified, so a failure
    part-way through the evaluation leaves all lists in ``results`` at the
    same length (the later ``pd.DataFrame(results)`` relies on that).

    Note: each call appends a new entry; calling it twice with the same
    embedding records that embedding twice.

    Args:
        embedding: Embedding object accepted by ``evaluate_similarity``.
        datasets: Mapping of dataset name to an object exposing ``X`` (word
            pairs) and ``y`` (scores). Every name must be a key of ``results``.

    Raises:
        ValueError: If ``datasets`` is empty (no average can be computed).
        KeyError: If a dataset name, or ``'Average'``, is missing from ``results``.
    """
    if not datasets:
        raise ValueError("similarity_task needs at least one dataset to evaluate")

    # Fail before the (slow) evaluation rather than after it.
    unknown = [key for key in list(datasets) + ['Average'] if key not in results]
    if unknown:
        raise KeyError("no entry in results for: {}".format(unknown))

    for name, data in iteritems(datasets):
        # REFERENCE: https://github.com/kudkudak/word-embeddings-benchmarks/
        print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

    # Compute everything first; only commit to `results` once all scores exist.
    scores = [(name, evaluate_similarity(embedding, data.X, data.y))
              for name, data in iteritems(datasets)]

    for name, sim in scores:
        results[name].append(sim)
    results['Average'].append(sum(sim for _, sim in scores) / float(len(scores)))

In [9]:
# WE1: GoogleNews vectors
w_googlenews = fetch_SG_GoogleNews(clean_words=True)

02:11:45 INFO:loading projection weights from /Users/ranjan/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz
02:11:45 INFO:Loading #3000000 words with 300 dim


File already downloaded, skipping
























































































































































































































































































02:14:08 INFO:Transformed 3000000 into 2967374 words
02:14:38 INFO:Transformed 2967374 into 2967374 words


In [10]:
# Score WE1 on every benchmark; appends one entry per dataset (and the average)
# to `results`, so re-running this cell records the embedding again.
similarity_task(embedding=w_googlenews, datasets=similarity_datasets)



Sample data from MTurk: pair "episcopal" and "russia" is assigned score 5.5
Sample data from TR9856: pair "video" and "violent video games" is assigned score 0.7
Sample data from Rare Words: pair "squishing" and "squirt" is assigned score 5.88
Sample data from MEN: pair "sun" and "sunlight" is assigned score [10.]
Sample data from Rubenstein and Goodenough: pair "gem" and "jewel" is assigned score 9.85
Sample data from WS353: pair "love" and "sex" is assigned score 6.77
Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58




In [11]:
# WE2: PDC vectors
w_PDC = fetch_PDC(clean_words=True)

02:14:47 INFO:loading projection weights from /Users/ranjan/web_data/embeddings/pdc/pdc300.txt.bz2


File already downloaded, skipping


02:17:03 INFO:Transformed 388723 into 388723 words
02:17:06 INFO:Transformed 388723 into 388723 words


In [12]:
# Score WE2 on every benchmark; appends one entry per dataset (and the average)
# to `results`, so re-running this cell records the embedding again.
similarity_task(embedding=w_PDC, datasets=similarity_datasets)



Sample data from MTurk: pair "episcopal" and "russia" is assigned score 5.5
Sample data from TR9856: pair "video" and "violent video games" is assigned score 0.7
Sample data from Rare Words: pair "squishing" and "squirt" is assigned score 5.88
Sample data from MEN: pair "sun" and "sunlight" is assigned score [10.]
Sample data from Rubenstein and Goodenough: pair "gem" and "jewel" is assigned score 9.85
Sample data from WS353: pair "love" and "sex" is assigned score 6.77
Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58




In [13]:
# WE3: HDC vectors
w_HDC = fetch_HDC(clean_words=True)

02:17:07 INFO:loading projection weights from /Users/ranjan/web_data/embeddings/hdc/hdc300.txt.bz2


File already downloaded, skipping


02:19:30 INFO:Transformed 388723 into 388723 words
02:19:33 INFO:Transformed 388723 into 388723 words


In [14]:
# Score WE3 on every benchmark; appends one entry per dataset (and the average)
# to `results`, so re-running this cell records the embedding again.
similarity_task(embedding=w_HDC, datasets=similarity_datasets)



Sample data from MTurk: pair "episcopal" and "russia" is assigned score 5.5
Sample data from TR9856: pair "video" and "violent video games" is assigned score 0.7
Sample data from Rare Words: pair "squishing" and "squirt" is assigned score 5.88
Sample data from MEN: pair "sun" and "sunlight" is assigned score [10.]
Sample data from Rubenstein and Goodenough: pair "gem" and "jewel" is assigned score 9.85
Sample data from WS353: pair "love" and "sex" is assigned score 6.77
Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58




In [15]:
# WE4: LexVec vectors, "commoncrawl-W+C" variant
w_lexVec = fetch_LexVec(clean_words=True, which="commoncrawl-W+C")

02:19:35 INFO:loading projection weights from /Users/ranjan/web_data/embeddings/lexvec.commoncrawl.300d.W%2BC.pos.vectors.gz


File already downloaded, skipping


02:28:17 INFO:Transformed 2000000 into 1999999 words
02:28:33 INFO:Transformed 1999999 into 1999999 words


In [16]:
# Score WE4 on every benchmark; appends one entry per dataset (and the average)
# to `results`, so re-running this cell records the embedding again.
similarity_task(embedding=w_lexVec, datasets=similarity_datasets)

Sample data from MTurk: pair "episcopal" and "russia" is assigned score 5.5
Sample data from TR9856: pair "video" and "violent video games" is assigned score 0.7
Sample data from Rare Words: pair "squishing" and "squirt" is assigned score 5.88
Sample data from MEN: pair "sun" and "sunlight" is assigned score [10.]
Sample data from Rubenstein and Goodenough: pair "gem" and "jewel" is assigned score 9.85
Sample data from WS353: pair "love" and "sex" is assigned score 6.77
Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58




In [17]:
# WE5: ConceptNet Numberbatch vectors
w_numbatch = fetch_conceptnet_numberbatch(clean_words=True)

02:28:40 INFO:loading projection weights from /Users/ranjan/web_data/embeddings/numberbatch-en-17.06.txt.gz


File already downloaded, skipping




02:30:18 INFO:Transformed 417194 into 416811 words
02:30:21 INFO:Transformed 416811 into 416811 words


In [18]:
# Score WE5 on every benchmark; appends one entry per dataset (and the average)
# to `results`, so re-running this cell records the embedding again.
similarity_task(embedding=w_numbatch, datasets=similarity_datasets)



Sample data from MTurk: pair "episcopal" and "russia" is assigned score 5.5
Sample data from TR9856: pair "video" and "violent video games" is assigned score 0.7
Sample data from Rare Words: pair "squishing" and "squirt" is assigned score 5.88
Sample data from MEN: pair "sun" and "sunlight" is assigned score [10.]
Sample data from Rubenstein and Goodenough: pair "gem" and "jewel" is assigned score 9.85
Sample data from WS353: pair "love" and "sex" is assigned score 6.77
Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58




In [26]:
# Rows: one per benchmark plus "Average", in a fixed presentation order.
# Columns: one per evaluated embedding, numbered 0..N-1 in the order
# `similarity_task` was called (WE1..WE5 in this notebook).
df = pd.DataFrame(results).T.loc[
    ['MTurk','MEN','WS353','Rubenstein and Goodenough','Rare Words','SIMLEX999','TR9856','Average']
]
# Highest score achieved on each row across all embeddings.
df["Best Score"] = df.max(axis=1)
df

Unnamed: 0,0,1,2,3,4,Best Score
MTurk,0.681332,0.672333,0.65767,0.711555,0.717486,0.717486
MEN,0.758531,0.772648,0.760335,0.809187,0.852869,0.852869
WS353,0.700017,0.733431,0.716873,0.692889,0.753426,0.753426
Rubenstein and Goodenough,0.760783,0.790069,0.805805,0.764542,0.90988,0.90988
Rare Words,0.497105,0.472393,0.463447,0.489417,0.544665,0.544665
SIMLEX999,0.441966,0.426882,0.406832,0.419321,0.649815,0.649815
TR9856,0.179013,0.206839,0.207092,0.12028,0.129749,0.207092
Average,0.574106,0.582085,0.574008,0.572456,0.651127,0.651127
