# Evaluation

Here, we evaluate our trained embeddings on downstream metrics.

In [6]:
import os
import glob
import numpy as np
from tqdm import tqdm
import pandas as pd
from gensim import models
from gensim.models import Word2Vec, KeyedVectors
from collections import Counter
import matplotlib.pyplot as plt
from six import iteritems

from utils import standardize_string, embedding_info, load_word_vectors, most_similar_words
from embedding import Embedding

from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans
# from sklearn.datasets.base import Bunch
from sklearn.datasets._base import Bunch

In [10]:
# Helper functions
def get_analogy_vec(embeddings: Word2Vec, word1, word2, word3):
    """"""
    try:
        w1_vec = embeddings.wv[word1]
        w2_vec = embeddings.wv[word2]
        w3_vec = embeddings.wv[word3]
        
        analogy_vec = w1_vec - w2_vec + w3_vec
        return analogy_vec
    except Exception as e:
        return False
        
# Download bencharmks
def download_file(url, save_path):
    """"""
        
# Anology Benchmarks
def get_google_analogy_benchmark():
    """
    Testing both semantic and syntactic analogies.
    """
    dataset = "EN-GOOGLE.txt"
    # If not at location data/EN-GOOGLE.txt, download it
    
    with open("data/EN-GOOGLE.txt", "r") as f:
        L = f.read().splitlines()

    # Simple 4 word analogy questions with categories
    questions = []
    answers = []        
    category = []       # categories of analogies
    cat = None
    for l in L:
        if l.startswith(":"):
            cat =l.lower().split()[1]
        else:
            words =  standardize_string(l).split()
            questions.append(words[0:3])
            answers.append(words[3])
            category.append(cat)
            
            
    print(f"There are {len(questions)} questions")
    print(f"There are {len(set(category))} categories")
    category_distribution = Counter(category)
    print("Disribution", category_distribution)
    return questions, answers, category

def get_msr_benchmark():
    """
    Test performance on syntatic analygies
    """
    with open("data/EN-MSR.txt", "r") as f:
        L = f.read().splitlines()
        
    questions = []
    answers = []
    category = []
    for l in L:
        words = standardize_string(l).split()
        questions.append(words[0:3])
        answers.append(words[4])
        category.append(words[3])

    verb = set([c for c in set(category) if c.startswith("VB")])
    noun = set([c for c in set(category) if c.startswith("NN")])
    category_high_level = []
    for cat in category:
         if cat in verb:
             category_high_level.append("verb")
         elif cat in noun:
             category_high_level.append("noun")
         else:
             category_high_level.append("adjective")
             
    print(f"There are {len(questions)} questions")
    print(f"There are {len(set(category))} categories")
    category_distribution = Counter(category)
    print("Disribution", category_distribution)
    return questions, answers, category

def get_wordrep_benchmark():
    """
    """
    
def get_semeval2012_benchmark():
    """
    """

# Similarity Tasks
def get_ws353_benchmark():
    """
    Test performance on similarity
    """
    path = "./data/WORDSIM353/combined.csv" # or combined.tab
    
    dataset = pd.read_csv(path)
    
    words = []
    human_scores = []
    for _, row in dataset.iterrows():
        word1, word2 = row['Word 1'], row['Word 2']
        human_similarity = row['Human (mean)']

        words.append([word1, word2])
        human_scores.append(human_similarity)
        
    return Bunch(X=words, y=human_scores)
    # return words, human_scores

def get_simlex999_benchmark():
    """
    SimLex999 dataset for testing attributional similarity
    """
    data_path = "./data"
    dataset = "EN-SIM999"
    data = pd.read_csv(f'{data_path}/{dataset}.txt', sep="\t")
    # We basically select all the columns available
    X = data[['word1', 'word2']].values
    y = data['SimLex999'].values
    sd = data['SD(SimLex)'].values
    conc = data[['conc(w1)', 'conc(w2)', 'concQ']].values
    POS = data[['POS']].values
    assoc = data[['Assoc(USF)', 'SimAssoc333']].values
    return Bunch(X=X.astype("object"), y=y, sd=sd, conc=conc, POS=POS, assoc=assoc)
    

def get_RG65_benchmark():
    """
    Rubenstein and Goodenough dataset for testing attributional and
    relatedness similarity
    """
    data_path = "./data"
    dataset = "EN-RG-65"
    data = pd.read_csv(f'{data_path}/{dataset}.txt', header=None, sep="\t").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(float),
                 sd=np.std(data[:, 3:].astype(float)))
   
    
def get_RW_benchmark():
    data_path = "./data"
    dataset = "EN-RW"
    data = pd.read_csv(f'{data_path}/{dataset}.txt', header=None, sep="\t").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(float),
                 sd=np.std(data[:, 3:].astype(float)))
    
def get_MTurk_benchmark():
    """
    MTurk dataset for testing attributional similarity
    """
    data_path = "./data"
    dataset = "EN-TRUK"
    data = pd.read_csv(f'{data_path}/{dataset}.txt', header=None, sep=" ").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(float))
    

# Categorization Tasks
def calculate_purity(y_true, y_pred):
    """
    Calculate purity for given true and predicted cluster labels.

    Parameters
    ----------
    y_true: array, shape: (n_samples, 1)
      True cluster labels

    y_pred: array, shape: (n_samples, 1)
      Cluster assingment.

    Returns
    -------
    purity: float
      Calculated purity.
    """
    assert len(y_true) == len(y_pred)
    true_clusters = np.zeros(shape=(len(set(y_true)), len(y_true)))
    pred_clusters = np.zeros_like(true_clusters)
    for id, cl in enumerate(set(y_true)):
        true_clusters[id] = (y_true == cl).astype("int")
    for id, cl in enumerate(set(y_pred)):
        pred_clusters[id] = (y_pred == cl).astype("int")

    M = pred_clusters.dot(true_clusters.T)
    return 1. / len(y_true) * np.sum(np.max(M, axis=1))

def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all.
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.
    """

    if isinstance(w, dict):
        print("Convert to Embedding")
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                       metric="euclidean",
                                                                       linkage="ward").fit_predict(words[ids]))
        # logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for metric in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                          metric=metric,
                                                                          linkage=linkage).fit_predict(words[ids]))
                # logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=len(set(y))).
                                  fit_predict(words[ids]))
        # logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity

def get_cluster_assignments(dataset_name, sep=" ", skip_header=False):
    data_dir = "./data"
    files = glob.glob(os.path.join(data_dir, dataset_name + "/*.txt"))
    X = []
    y = []
    names = []
    for cluster_id, file_name in enumerate(files):
        with open(file_name) as f:
            lines = f.read().splitlines()[(int(skip_header)):]

            X += [l.split(sep) for l in lines]
            y += [os.path.basename(file_name).split(".")[0]] * len(lines)
    return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object"))
    
def get_ap_benchmark():
    """
    Test performance on Almuhareb and Abdulrahman categorization dataset
    """
    return get_cluster_assignments("EN-AP")

def get_bless_benchmark():
    """
    Baroni and Marco categorization dataset
    """
    return get_cluster_assignments("EN-BLESS")
  
def get_battig_benchmark():
    """
    Fetch 1969 Battig dataset
    """
    data = get_cluster_assignments("EN-BATTIG", sep=",", skip_header=True)
    return Bunch(X=data.X[:, 0], y=data.y,freq=data.X[:, 1], frequency=data.X[:, 2], rank=data.X[:, 3], rfreq=data.X[:, 4])

def get_essli_2c_benchmark():
    """
    ESSLI 2c task categorization dataset
    """
    return get_cluster_assignments("EN-ESSLI-2c")
    
def get_essli_2b_benchmark():
    """
    ESSLI 2b task categorization dataset
    """
    return get_cluster_assignments("EN-ESSLI-2b")
    
def get_essli_1a_benchmark():
    """
    ESSLI 1a task categorization dataset
    """
    return get_cluster_assignments("EN-ESSLI-1a")

## Daily Dialog

Let's first do some analogy benchmarks.

In [10]:
dailydialog_word2vec = Word2Vec.load('./out/dailydialog_word2vec_100.model')

In [19]:
questions, answers, category = get_google_analogy_benchmark() # google analogy benchmark
# questions, answers, category = get_msr_benchmark() # msr benchmark

subset = range(0, len(questions))
correct_total_count, analogies_attempted = 0, 0
correct_per_category = {}     # Calculate correct per category

for index in tqdm(subset):
    word1, word2, word3 = questions[index]
    answer = answers[index]

    analogy_vec = get_analogy_vec(dailydialog_word2vec, word1, word2, word3)
    if analogy_vec is False:
        continue
    
    # Get similar vectors
    similar_vectors = dailydialog_word2vec.wv.most_similar(positive=[analogy_vec])
    # Considered correct if answer in top K = 5 of most similar
    correct = True if answer in [word for word, _ in similar_vectors[:5]] else False
    
    analogies_attempted += 1
    if correct:
        correct_total_count += 1
        # print("-" * 50)
        # print(f"Category is f{category[index]}")
        # print(f"Q: {word1} is to {word2} as {word3} is to ?")
        # print(f"A: {answer}")
        # print(f"P: {similar_vectors}")
        if category[index] not in correct_per_category:
            correct_per_category[category[index]] = 0
        else:
            correct_per_category[category[index]] += 1

print("Total correct:", correct_total_count)
print("Analogies attempted:", analogies_attempted)
print("Correct_per_category", correct_per_category)

There are 19544 questions
There are 14 categories
Disribution Counter({'capital-world': 4524, 'city-in-state': 2467, 'gram6-nationality-adjective': 1599, 'gram7-past-tense': 1560, 'gram3-comparative': 1332, 'gram8-plural': 1332, 'gram4-superlative': 1122, 'gram5-present-participle': 1056, 'gram1-adjective-to-adverb': 992, 'gram9-plural-verbs': 870, 'currency': 866, 'gram2-opposite': 812, 'capital-common-countries': 506, 'family': 506})


100%|██████████| 19544/19544 [00:15<00:00, 1298.72it/s]

Total correct: 170
Analogies attempted: 8618
Correct_per_category {'capital-common-countries': 3, 'capital-world': 3, 'family': 155, 'gram8-plural': 5}





Next, let's perform a word similarity benchmarks. 

In [51]:
similarity_tasks = {
   "WS353" : get_ws353_benchmark(),
   "SimLex999": get_simlex999_benchmark(),
   "RG65": get_RG65_benchmark(),
   "RW": get_RW_benchmark(),
   "MTurk": get_MTurk_benchmark()
}

similarity_results = {}
for task, data in iteritems(similarity_tasks):
    print(task)
    sim_words, sim_scores = data.X, data.y

    human_scores = []
    model_scores = []
    total_checked = 0

    for i, words in enumerate(sim_words):
        word1, word2 = words[0], words[1]
    
        # Check if words exist in embedding dictionary
        try:
            word1_vec = dailydialog_word2vec.wv[word1]
            word2_vec = dailydialog_word2vec.wv[word2]
            similarity = 10 * cosine_similarity(word1_vec.reshape(1, -1), word2_vec.reshape(1, -1))
            model_similarity = similarity[0][0]     
        except Exception as e:
            # print(e)
            continue
            
        total_checked += 1
        # Store the scores
        human_scores.append(sim_scores[i])
        model_scores.append(model_similarity)
        

    model_correlation, p_value = spearmanr(human_scores, model_scores)
    print("-" * 50)
    print('total checked', total_checked, 'out of', len(sim_scores))
    print('Corr:',model_correlation)
    print('P-Val:',p_value)
    
    similarity_results[task] = model_correlation
    
sim = pd.DataFrame([similarity_results])
print(sim)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


WS353
--------------------------------------------------
total checked 280 out of 353
Corr: 0.1874229884478066
P-Val: 0.0016320460055392872
SimLex999
--------------------------------------------------
total checked 899 out of 999
Corr: 0.048256811542747344
P-Val: 0.14825320993622756
RG65
--------------------------------------------------
total checked 35 out of 65
Corr: 0.04020170931722092
P-Val: 0.8186423720913868
RW
--------------------------------------------------
total checked 279 out of 2034
Corr: 0.03797309230703268
P-Val: 0.5276117216290732
MTurk
--------------------------------------------------
total checked 186 out of 287
Corr: 0.057520211590356474
P-Val: 0.4354917272912583
      WS353  SimLex999      RG65        RW    MTurk
0  0.187423   0.048257  0.040202  0.037973  0.05752


Next, let's perform classification test benches.

In [34]:
ap_bench = get_ap_benchmark()
bless_bench = get_bless_benchmark()
battig_bench = get_battig_benchmark()
essli_2c_bench = get_essli_2c_benchmark()
essli_2b_bench = get_essli_2b_benchmark()
essli_1a_bench = get_essli_1a_benchmark()

categorization_tasks = {
    "AP": ap_bench,
    "BLESS": bless_bench,
    "BATTIG": battig_bench,
    "ESSLI_2c": essli_2c_bench,
    "ESSLI_2b": essli_2b_bench,
    "ESSLI_1a": essli_1a_bench
}

categorization_results = {}
w = Embedding.from_gensim_word2vec(dailydialog_word2vec)

# Calculate results using helper function
for name, data in iteritems(categorization_tasks):
    categorization_results[name] = evaluate_categorization(w, data.X, data.y)
    
# Construct pd table
cat = pd.DataFrame([categorization_results])
print(cat)
del categorization_tasks

         AP  BLESS    BATTIG  ESSLI_2c  ESSLI_2b  ESSLI_1a
0  0.149254   0.22  0.095775  0.511111      0.45  0.431818


## Google News
Pretrained embeddings.

In [53]:
googlenews_kv: KeyedVectors = models.KeyedVectors.load_word2vec_format(
    './data/GoogleNews-vectors-negative300.bin', binary=True)

print(googlenews_kv)

KeyedVectors<vector_size=300, 3000000 keys>


In [10]:
analogy_tasks = {
    "GOOGLE": "",
    "MSR": ""
}

# questions, answers, category = get_google_analogy_benchmark()
questions, answers, category = get_msr_benchmark()
correct_total_count, analogies_attempted = 0, 0
correct_per_category = {}     # Calculate correct per category
for index in tqdm(range(0, len(questions))):
    word1, word2, word3 = questions[index]
    answer = answers[index]

    try:
        w1_vec = googlenews_kv[word1]
        w2_vec = googlenews_kv[word2]
        w3_vec = googlenews_kv[word3]
        
        analogy_vec = w1_vec - w2_vec + w3_vec
    except Exception as e:
        continue
    
    # Get similar vectors
    similar_vectors = googlenews_kv.most_similar(positive=[analogy_vec])
    # Considered correct if answer in top K = 5 of most similar
    correct = True if answer in [word for word, _ in similar_vectors[:5]] else False
    
    analogies_attempted += 1
    if correct:
        correct_total_count += 1
        if category[index] not in correct_per_category:
            correct_per_category[category[index]] = 0
        else:
            correct_per_category[category[index]] += 1

print("Total correct:", correct_total_count)
print("Analogies attempted:", analogies_attempted)
print("Correct_per_category", correct_per_category)

There are 8000 questions
There are 16 categories
Disribution Counter({'jj_jjr': 500, 'jjr_jj': 500, 'jj_jjs': 500, 'jjs_jj': 500, 'jjs_jjr': 500, 'jjr_jjs': 500, 'nn_nns': 500, 'nns_nn': 500, 'nn_nnpos': 500, 'nnpos_nn': 500, 'vb_vbd': 500, 'vbd_vb': 500, 'vb_vbz': 500, 'vbz_vb': 500, 'vbz_vbd': 500, 'vbd_vbz': 500})


100%|██████████| 8000/8000 [00:00<00:00, 507961.79it/s]

Total correct: 0
Analogies attempted: 0
Correct_per_category {}





Let's do some similarity testing

In [55]:
similarity_tasks = {
   "WS353" : get_ws353_benchmark(),
   "SimLex999": get_simlex999_benchmark(),
   "RG65": get_RG65_benchmark(),
   "RW": get_RW_benchmark(),
   "MTurk": get_MTurk_benchmark()
}

similarity_results = {}
for task, data in iteritems(similarity_tasks):
    sim_words, sim_scores = data.X, data.y

    human_scores = []
    model_scores = []
    total_checked = 0

    for i, words in enumerate(sim_words):
        word1, word2 = words[0], words[1]
    
        # Check if words exist in embedding dictionary
        try:
            word1_vec = googlenews_kv[word1]
            word2_vec = googlenews_kv[word2]
            similarity = 10 * cosine_similarity(word1_vec.reshape(1, -1), word2_vec.reshape(1, -1))
            model_similarity = similarity[0][0]     
        except Exception as e:
            continue
            
        total_checked += 1
        # Store the scores
        human_scores.append(sim_scores[i])
        model_scores.append(model_similarity)
        

    model_correlation, p_value = spearmanr(human_scores, model_scores)
    print("-" * 50)
    print(task)
    print('total checked', total_checked, 'out of', len(sim_scores))
    print('Corr:',model_correlation)
    print('P-Val:',p_value)
    
    similarity_results[task] = model_correlation
    
sim = pd.DataFrame([similarity_results])
print(sim)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


--------------------------------------------------
WS353
total checked 353 out of 353
Corr: 0.7000166486272194
P-Val: 2.86866666051422e-53
--------------------------------------------------
SimLex999
total checked 999 out of 999
Corr: 0.44196551091403796
P-Val: 5.068221892023142e-49
--------------------------------------------------
RG65
total checked 65 out of 65
Corr: 0.7607828603850846
P-Val: 1.9330285740005686e-13
--------------------------------------------------
RW
total checked 1825 out of 2034
Corr: 0.5342097582319317
P-Val: 3.409254222298205e-135
--------------------------------------------------
MTurk
total checked 275 out of 287
Corr: 0.6839689831303845
P-Val: 2.8467829183547183e-39
      WS353  SimLex999      RG65       RW     MTurk
0  0.700017   0.441966  0.760783  0.53421  0.683969


Let's do some classification tests

In [6]:
categorization_tasks = {
    "AP": get_ap_benchmark(),
    "BLESS": get_bless_benchmark(),
    "BATTIG": get_battig_benchmark(),
    "ESSLI_2c": get_essli_2c_benchmark(),
    "ESSLI_2b": get_essli_2b_benchmark(),
    "ESSLI_1a": get_essli_1a_benchmark()
}

categorization_results = {}
w = Embedding.from_gensim_keyedvectors(googlenews_kv, pretrained=True)

# Calculate results using helper function
for name, data in iteritems(categorization_tasks):
    try: 
        categorization_results[name] = evaluate_categorization(w, data.X, data.y)
    except Exception as e:
        print(name, e)
        continue
    
# Construct pd table
cat = pd.DataFrame([categorization_results])
print(cat)
del categorization_tasks

         AP  BLESS    BATTIG  ESSLI_2c  ESSLI_2b  ESSLI_1a
0  0.639303   0.79  0.382527  0.644444       0.8      0.75


# Text8
Now let's evaluate the text8 dataset

In [16]:
text8_vectors = Word2Vec.load('./out/text8_word2vec_100.model')
embedding_info(text8_vectors)

Vector size: 100
Dictionary size 253854
Window size 5
Total training time 50.620567683596164


In [10]:
questions, answers, category = get_google_analogy_benchmark() # google analogy benchmark
# questions, answers, category = get_msr_benchmark() # msr benchmark

subset = range(0, len(questions))
correct_total_count, analogies_attempted = 0, 0
correct_per_category = {}     # Calculate correct per category

for index in tqdm(subset):
    word1, word2, word3 = questions[index]
    answer = answers[index]

    analogy_vec = get_analogy_vec(text8_vectors, word1, word2, word3)
    if analogy_vec is False:
        continue
    
    similar_vectors = text8_vectors.wv.most_similar(positive=[analogy_vec])
    # Considered correct if answer in top K = 5 of most similar
    correct = True if answer in [word for word, _ in similar_vectors[:5]] else False
    
    analogies_attempted += 1
    if correct:
        correct_total_count += 1
        if category[index] not in correct_per_category:
            correct_per_category[category[index]] = 0
        else:
            correct_per_category[category[index]] += 1

print("Total correct:", correct_total_count)
print("Analogies attempted:", analogies_attempted)
print("Correct_per_category", correct_per_category)

There are 19544 questions
There are 14 categories
Disribution Counter({'capital-world': 4524, 'city-in-state': 2467, 'gram6-nationality-adjective': 1599, 'gram7-past-tense': 1560, 'gram3-comparative': 1332, 'gram8-plural': 1332, 'gram4-superlative': 1122, 'gram5-present-participle': 1056, 'gram1-adjective-to-adverb': 992, 'gram9-plural-verbs': 870, 'currency': 866, 'gram2-opposite': 812, 'capital-common-countries': 506, 'family': 506})
Total correct: 649
Analogies attempted: 19170
Correct_per_category {'capital-world': 8, 'family': 206, 'gram1-adjective-to-adverb': 28, 'gram2-opposite': 89, 'gram3-comparative': 9, 'gram5-present-participle': 77, 'gram7-past-tense': 86, 'gram8-plural': 111, 'gram9-plural-verbs': 26}


In [13]:
similarity_tasks = {
   "WS353" : get_ws353_benchmark(),
   "SimLex999": get_simlex999_benchmark(),
   "RG65": get_RG65_benchmark(),
   "RW": get_RW_benchmark(),
   "MTurk": get_MTurk_benchmark()
}

similarity_results = {}
for task, data in iteritems(similarity_tasks):
    sim_words, sim_scores = data.X, data.y

    human_scores = []
    model_scores = []
    total_checked = 0

    for i, words in enumerate(sim_words):
        word1, word2 = words[0], words[1]
    
        # Check if words exist in embedding dictionary
        try:
            word1_vec = text8_vectors.wv[word1]
            word2_vec = text8_vectors.wv[word2]
            similarity = 10 * cosine_similarity(word1_vec.reshape(1, -1), word2_vec.reshape(1, -1))
            model_similarity = similarity[0][0]     
        except Exception as e:
            continue
            
        total_checked += 1
        # Store the scores
        human_scores.append(sim_scores[i])
        model_scores.append(model_similarity)
        

    model_correlation, p_value = spearmanr(human_scores, model_scores)
    print("-" * 50)
    print(task)
    print('total checked', total_checked, 'out of', len(sim_scores))
    print('Corr:',model_correlation)
    print('P-Val:',p_value)
    
    similarity_results[task] = model_correlation
    
sim = pd.DataFrame([similarity_results])
print(sim)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


--------------------------------------------------
WS353
total checked 335 out of 353
Corr: 0.6068265046133258
P-Val: 4.4385588329947783e-35
--------------------------------------------------
SimLex999
total checked 999 out of 999
Corr: 0.25072799617794855
P-Val: 8.719873284483485e-16
--------------------------------------------------
RG65
total checked 65 out of 65
Corr: 0.49821333195456446
P-Val: 2.4082963481106378e-05
--------------------------------------------------
RW
total checked 1530 out of 2034
Corr: 0.28203625083558326
P-Val: 2.261693553518623e-29
--------------------------------------------------
MTurk
total checked 286 out of 287
Corr: 0.6015422128422939
P-Val: 1.555016242398753e-29
      WS353  SimLex999      RG65        RW     MTurk
0  0.606827   0.250728  0.498213  0.282036  0.601542


In [17]:
categorization_tasks = {
    "AP": get_ap_benchmark(),
    "BLESS": get_bless_benchmark(),
    "BATTIG": get_battig_benchmark(),
    "ESSLI_2c": get_essli_2c_benchmark(),
    "ESSLI_2b": get_essli_2b_benchmark(),
    "ESSLI_1a": get_essli_1a_benchmark()
}

categorization_results = {}
w = Embedding.from_gensim_word2vec(text8_vectors)

# Calculate results using helper function
for name, data in iteritems(categorization_tasks):
    try: 
        categorization_results[name] = evaluate_categorization(w, data.X, data.y)
    except Exception as e:
        print(name, e)
        continue
    
# Construct pd table
cat = pd.DataFrame([categorization_results])
print(cat)
del categorization_tasks

         AP  BLESS    BATTIG  ESSLI_2c  ESSLI_2b  ESSLI_1a
0  0.460199  0.435  0.226152  0.488889       0.8  0.613636


## Glove Twitter 27B

In [3]:
# # print("Read")
# glove_twitter_kv = load_word_vectors('./data/glove_twitter_27B/glove.twitter.27B.100d_copy.txt')

# # add headers to the file
# with open('./data/glove_twitter_27B/glove.twitter.27B.100d_copy.txt', 'r') as original: data = original.read()

# print("Rewrite")
# dictionary_size = len(glove_twitter_kv.items())
# with open('./data/glove_twitter_27B/glove.twitter.27B.100d.txt', 'w') as modified: modified.write(f"{dictionary_size} 100\n" + data)

print("Open")
# Turn <word> <vector> into a Word2Vec object
glove_twitter_kv = models.KeyedVectors.load_word2vec_format(
    './data/glove_twitter_27B/glove.twitter.27B.100d.txt', binary=False)

# Vocab size
print(glove_twitter_kv)

Open
KeyedVectors<vector_size=100, 1193514 keys>


In [9]:
# questions, answers, category = get_google_analogy_benchmark() # google analogy benchmark
questions, answers, category = get_msr_benchmark() # msr benchmark

subset = range(0, len(questions))
correct_total_count = 0         # Calculate total correct
analogies_attempted = 0
correct_per_category = {}     # Calculate correct per category

for index in tqdm(subset):
    word1, word2, word3 = questions[index]
    answer = answers[index]
    
    try:
        w1_vec = glove_twitter_kv[word1]
        w2_vec = glove_twitter_kv[word2]
        w3_vec = glove_twitter_kv[word3]
        
        analogy_vec = w1_vec - w2_vec + w3_vec
    except Exception as e:
        continue
    
    # Get similar vectors
    similar_vectors = glove_twitter_kv.most_similar(positive=[analogy_vec])
    # Considered correct if answer in top K = 5 of most similar
    correct = True if answer in [word for word, _ in similar_vectors[:5]] else False
        
    analogies_attempted += 1
    if correct:
        correct_total_count += 1
        if category[index] not in correct_per_category:
            correct_per_category[category[index]] = 0
        else:
            correct_per_category[category[index]] += 1

print("Total correct:", correct_total_count)
print("Analogies attempted:", analogies_attempted)
print("Correct_per_category", correct_per_category)

There are 8000 questions
There are 16 categories
Disribution Counter({'jj_jjr': 500, 'jjr_jj': 500, 'jj_jjs': 500, 'jjs_jj': 500, 'jjs_jjr': 500, 'jjr_jjs': 500, 'nn_nns': 500, 'nns_nn': 500, 'nn_nnpos': 500, 'nnpos_nn': 500, 'vb_vbd': 500, 'vbd_vb': 500, 'vb_vbz': 500, 'vbz_vb': 500, 'vbz_vbd': 500, 'vbd_vbz': 500})


100%|██████████| 8000/8000 [07:33<00:00, 17.64it/s] 

Total correct: 326
Analogies attempted: 7682
Correct_per_category {'jjr_jj': 0, 'jj_jjs': 10, 'jjs_jj': 7, 'nn_nns': 121, 'nns_nn': 108, 'nn_nnpos': 24, 'nnpos_nn': 19, 'vbz_vb': 16, 'vb_vbd': 3, 'vbd_vb': 2, 'vb_vbz': 4, 'vbd_vbz': 0}





In [57]:
similarity_tasks = {
   "WS353" : get_ws353_benchmark(),
   "SimLex999": get_simlex999_benchmark(),
   "RG65": get_RG65_benchmark(),
   "RW": get_RW_benchmark(),
   "MTurk": get_MTurk_benchmark()
}

similarity_results = {}
for task, data in iteritems(similarity_tasks):
    sim_words, sim_scores = data.X, data.y

    human_scores, model_scores = [], []
    total_checked = 0

    for i, words in enumerate(sim_words):
        word1, word2 = words[0], words[1]
    
        # Check if words exist in embedding dictionary
        try:
            word1_vec = glove_twitter_kv[word1]
            word2_vec = glove_twitter_kv[word2]
            similarity = 10 * cosine_similarity(word1_vec.reshape(1, -1), word2_vec.reshape(1, -1))
            model_similarity = similarity[0][0]     
        except Exception as e:
            continue
            
        total_checked += 1
        # Store the scores
        human_scores.append(sim_scores[i])
        model_scores.append(model_similarity)
        

    model_correlation, p_value = spearmanr(human_scores, model_scores)
    print("-" * 50)
    print(task)
    print('total checked', total_checked, 'out of', len(sim_scores))
    print('Corr:',model_correlation)
    print('P-Val:',p_value)
    
    similarity_results[task] = model_correlation
    
sim = pd.DataFrame([similarity_results])
print(sim)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


--------------------------------------------------
WS353
total checked 334 out of 353
Corr: 0.5212324333414307
P-Val: 1.1634914960544721e-24
--------------------------------------------------
SimLex999
total checked 998 out of 999
Corr: 0.12031043234601072
P-Val: 0.00013906433564553764
--------------------------------------------------
RG65
total checked 65 out of 65
Corr: 0.6774486160113895
P-Val: 5.746822577586313e-10
--------------------------------------------------
RW
total checked 1063 out of 2034
Corr: 0.3264945217909137
P-Val: 7.979260997289211e-28
--------------------------------------------------
MTurk
total checked 286 out of 287
Corr: 0.5650167476695827
P-Val: 1.6012296062189186e-25
      WS353  SimLex999      RG65        RW     MTurk
0  0.521232    0.12031  0.677449  0.326495  0.565017


In [14]:
categorization_tasks = {
    "AP": get_ap_benchmark(),
    "BLESS": get_bless_benchmark(),
    "BATTIG": get_battig_benchmark(),
    "ESSLI_2c": get_essli_2c_benchmark(),
    "ESSLI_2b": get_essli_2b_benchmark(),
    "ESSLI_1a": get_essli_1a_benchmark()
}

categorization_results = {}
w = Embedding.from_gensim_keyedvectors(glove_twitter_kv, pretrained=True)

# Calculate results using helper function
for name, data in iteritems(categorization_tasks):
    try: 
        categorization_results[name] = evaluate_categorization(w, data.X, data.y)
    except Exception as e:
        print(name, e)
        continue
    
# Construct pd table
cat = pd.DataFrame([categorization_results])
print(cat)
del categorization_tasks

         AP  BLESS    BATTIG  ESSLI_2c  ESSLI_2b  ESSLI_1a
0  0.462687  0.695  0.318868  0.511111      0.65  0.795455
