In [1]:
import json
import pickle

import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from scipy import spatial
import scipy.sparse as sp
from itertools import islice
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel

# Imports

In [2]:
# Locations of the evaluation data and of the trained models, relative to this notebook
path_to_test_data = Path("../../data", "test")
path_to_tppmi_model = Path("../../data", "ppmi-matrices", "nyt-data")
path_to_twec_model = Path("../../model", "nyt-data", "cade", "model")
path_to_static_model = Path("../../model", "nyt-data", "static")

# Functions

(to be moved into a separate module later)

## Similarity Calculations

In [3]:
def create_test_case_dict_cade(test_cases: np.ndarray, models: dict) -> dict:
    """
    Look up the embedding of every test case in the model of its own year.

    Parameters:
    test_cases (iterable of str): Test cases formatted as "<word>-<year>".
    models (dict): Maps "model_<year>" to a model object exposing `.wv`.

    Returns:
    dict: Maps each test case whose word is known to the model of its year to
          that word's vector. Test cases with unknown words are left out and
          their number is printed.

    Raises:
    KeyError: If `models` has no "model_<year>" entry for a test case.
    ValueError: If a test case contains no hyphen.
    """
    test_case_dict = dict()
    counter = 0
    for test_case in test_cases:
        # Split at the LAST hyphen only, so a hyphenated word keeps its full form
        word, year = test_case.rsplit("-", 1)
        keyed_vectors = models[f"model_{year}"].wv
        # Membership test on the keyed vectors instead of the `.vocab` attribute,
        # which newer gensim releases no longer provide
        if word in keyed_vectors:
            test_case_dict[test_case] = keyed_vectors.get_vector(word)
        else:
            counter += 1
    print(f"{counter} Testcases are not in the vocab of the model(s)")
    return test_case_dict

In [4]:
def create_test_case_dict_static(model, test_cases: np.ndarray) -> dict:
    """
    Look up the embedding of every test case in a single, time-independent model.

    Parameters:
    model: Model object exposing `.wv`.
    test_cases (iterable of str): Test cases formatted as "<word>-<year>".
        The year is ignored for the lookup because the same model serves all years.

    Returns:
    dict: Maps each test case whose word is known to the model to that word's
          vector. Test cases with unknown words are left out and their number
          is printed.

    Raises:
    ValueError: If a test case contains no hyphen.
    """
    test_case_dict = dict()
    counter = 0
    keyed_vectors = model.wv
    for test_case in test_cases:
        # Split at the LAST hyphen only, so a hyphenated word keeps its full form
        word, _year = test_case.rsplit("-", 1)
        # Membership test on the keyed vectors instead of the `.vocab` attribute,
        # which newer gensim releases no longer provide
        if word in keyed_vectors:
            test_case_dict[test_case] = keyed_vectors.get_vector(word)
        else:
            counter += 1
    print(f"{counter} Testcases are not in the vocab of the model")
    return test_case_dict

In [5]:
def get_similarities_of_models(model_dict: dict, test_word_dict: dict, topn: int = 10) -> dict:
    """
    Retrieve, for every test case, its nearest neighbours in every yearly model.

    Parameters:
    model_dict (dict): Maps "model_<year>" to a model object exposing `.wv`.
    test_word_dict (dict): Maps a test case to its query vector.
    topn (int): Number of neighbours requested from each model. The default of 10
        keeps the previous behaviour, which relied on the library default.

    Returns:
    dict: {test case: {year: result of `similar_by_vector` for that year's model}},
          where the year is the part of the model key after the first underscore.
    """
    similarities = dict()
    # tqdm only adds a progress bar over the test cases
    for test_case, query_vector in tqdm(test_word_dict.items()):
        similarities[test_case] = {
            model_key.split("_")[1]: model.wv.similar_by_vector(query_vector, topn=topn)
            for model_key, model in model_dict.items()
        }
    return similarities

In [6]:
def get_similarities_of_models_static(model, test_word_dict: dict, topn: int = 10) -> dict:
    """
    Retrieve, for every test case, its nearest neighbours in one static model.

    Parameters:
    model: Model object exposing `.wv`.
    test_word_dict (dict): Maps a test case to its query vector.
    topn (int): Number of neighbours requested from the model. The default of 10
        keeps the previous behaviour, which relied on the library default.

    Returns:
    dict: {test case: result of `similar_by_vector` for its query vector}.
    """
    # tqdm only adds a progress bar over the test cases
    return {
        test_case: model.wv.similar_by_vector(query_vector, topn=topn)
        for test_case, query_vector in tqdm(test_word_dict.items())
    }

## Evaluation Metrics

In [7]:
def calculate_reciprocal_rank(test_list: list, test_word: str) -> float:
    """
    Return the reciprocal of the 1-based position of a word in a ranked list.

    Parameters:
    test_list (list of str): Ranked candidates, best match first.
    test_word (str): The expected answer to look for in test_list.

    Returns:
    float: 1 / rank of the first occurrence of test_word, or 0.0 if it does not occur.
    """
    try:
        zero_based_position = test_list.index(test_word)
    except ValueError:
        # list.index reports a missing element with ValueError
        return 0.0
    # Ranks start at 1 while list positions start at 0
    return 1.0 / (zero_based_position + 1)

In [8]:
def calculate_precision_at_k(test_list: list, test_word: str, k: int) -> int:
    """
    Tell whether a word appears among the first K entries of a ranked list.

    Parameters:
    test_list (list of str): Ranked candidates, best match first.
    test_word (str): The expected answer to look for.
    k (int): How many leading entries of test_list to inspect.

    Returns:
    int: 1 if test_word is among the first k entries of test_list, otherwise 0
         (the Precision@K of a single test, P@K[i]).

    Raises:
    ValueError: If k is zero or negative.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # A bool converts to exactly the 1 / 0 score wanted here
    return int(test_word in test_list[:k])

In [9]:
def calculate_mean_rank(test_key: str, testcase: dict, test_data: pd.DataFrame, metric = "MRR", k = 10)-> float:
    """
    Average a ranking score over all ground-truth pairs of one test case.

    Parameters:
    test_key (str): The query, formatted as "<word>-<year>"; selects the rows of
        test_data whose "truth" column equals it.
    testcase (dict): Maps a year to the neighbours retrieved for that year, each
        neighbour being a sequence whose first element is the word.
    test_data (pd.DataFrame): Ground truth with the columns "truth" and
        "equivalent" (both "<word>-<year>").
    metric (str): "MRR" scores with the reciprocal rank; any other value scores
        with precision at k.
    k (int): Cutoff for precision at k; not used for "MRR".

    Returns:
    float: Mean score over every (year, equivalent) pair that has ground truth,
           or 0.0 if there is none.
    """
    equivalents = test_data.loc[test_data["truth"] == test_key, "equivalent"]
    scores = []

    for year, neighbours in testcase.items():
        equivalents_for_year = equivalents[equivalents.str.endswith(f"-{year}")]
        if equivalents_for_year.empty:
            continue  # Skip if no data for year, as there's nothing to rank

        word_list = [item[0] for item in neighbours]

        # Score EVERY ground-truth row of the year; looking only at the first one
        # would silently drop the others when a year has several equivalents
        for equivalent in equivalents_for_year:
            # Split at the last hyphen so a hyphenated word is not truncated
            target_word = equivalent.rsplit("-", 1)[0]
            if metric == "MRR":
                score = calculate_reciprocal_rank(word_list, target_word)
            else:
                score = calculate_precision_at_k(word_list, target_word, k)
            scores.append(score)

    if not scores:  # Ensure division by 0 does not occur
        return 0.0
    return sum(scores) / len(scores)

In [10]:
def calculate_rank_metric(similarities: dict, test_data: pd.DataFrame, metric = "MRR", k = 10) -> float:
    """
    Average the per-test-case score of `calculate_mean_rank` over all test cases.

    Parameters:
    similarities (dict): Maps a test case to its per-year neighbour lists.
    test_data (pd.DataFrame): Ground truth, passed through to calculate_mean_rank.
    metric (str): Passed through to calculate_mean_rank.
    k (int): Passed through to calculate_mean_rank.

    Returns:
    float: Mean of the per-test-case scores, or 0 if similarities is empty.

    NOTE(review): this is a mean of per-test-case means, whereas
    calculate_rank_metric_static averages over individual ground-truth rows.
    A test case for which calculate_mean_rank finds no data also enters the
    average as 0. Confirm both are intended before comparing the two tables.
    """
    per_case_scores = [
        calculate_mean_rank(test_case, neighbours_by_year, test_data, metric, k)
        for test_case, neighbours_by_year in similarities.items()
    ]

    if not per_case_scores:  # nothing to average
        return 0
    return sum(per_case_scores) / len(per_case_scores)

In [93]:
def calculate_rank_metric_static(similarities: dict, test_data: pd.DataFrame, metric = "MRR", k = 10) -> float:
    """
    Average a ranking score over all ground-truth pairs for a static model.

    Parameters:
    similarities (dict): Maps a test case ("<word>-<year>") to one neighbour list,
        each neighbour being a sequence whose first element is the word.
    test_data (pd.DataFrame): Ground truth with the columns "truth" and
        "equivalent" (both "<word>-<year>").
    metric (str): "MRR" scores with the reciprocal rank; any other value scores
        with precision at k.
    k (int): Cutoff for precision at k; not used for "MRR".

    Returns:
    float: Mean score over every ground-truth row whose "truth" is a key of
           similarities, or 0.0 if there is no such row.
    """
    scores = []

    for test_case, neighbours in similarities.items():
        word_list = [item[0] for item in neighbours]
        # Only the "equivalent" column is needed, so iterate over it directly
        equivalents = test_data.loc[test_data["truth"] == test_case, "equivalent"]
        for equivalent in equivalents:
            # Split at the last hyphen so a hyphenated word is not truncated
            target_word = equivalent.rsplit("-", 1)[0]
            if metric == "MRR":
                score = calculate_reciprocal_rank(word_list, target_word)
            else:
                score = calculate_precision_at_k(word_list, target_word, k)
            scores.append(score)

    if not scores:  # Ensure division by 0 does not occur
        return 0.0
    return sum(scores) / len(scores)

# Setup

## Testsets

### Testset 1

Testset 1 is based on publicly recorded knowledge that lists, for each year, the name associated with a particular role, such as U.S. president, U.K. prime minister, NFL Super Bowl champion team, and so on.

In [13]:
test_data_1_all = pd.read_csv(path_to_test_data / "testset_1.csv")

In [14]:
test_data_1_all.columns = ['truth', 'equivalent']

In [15]:
test_data_1_all = test_data_1_all.sort_values(by='truth', ascending=True)

In [16]:
test_cases_1_all = test_data_1_all['truth'].unique()

In [17]:
print("Testset 1")
print(f"Testcases (all): {len(test_data_1_all)}")
print(f"Testcases (unique): {len(test_cases_1_all)}")

Testset 1
Testcases (all): 11027
Testcases (unique): 499


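Now we split the testset into static and dynamic test cases, as done by Di Carlo et al. in their paper "Training Temporal Word Embeddings with a Compass". A test pair is static when the truth and its equivalent are the same word (only the year differs), and dynamic when the two words differ.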

In [18]:
# Splitting the columns into words and years.
# Split at the LAST hyphen only (n=1), so a hyphenated word stays in column 0
# instead of being spread over several columns.
split_truth = test_data_1_all['truth'].str.rsplit('-', n=1, expand=True)
split_equivalent = test_data_1_all['equivalent'].str.rsplit('-', n=1, expand=True)

# Creating masks for "static" and "dynamic" conditions:
# static = same word in both years, dynamic = everything else
static_mask = split_truth[0] == split_equivalent[0]
dynamic_mask = ~static_mask

# Applying the masks to create the separate DataFrames
test_data_1_static = test_data_1_all[static_mask]
test_data_1_dynamic = test_data_1_all[dynamic_mask]

In [19]:
test_cases_1_static = test_data_1_static['truth'].unique()
test_cases_1_dynamic = test_data_1_dynamic['truth'].unique()

In [20]:
print("Static")
print(f"Testcases (all): {len(test_data_1_static)}")
print(f"Testcases (unique): {len(test_cases_1_static)}")

Static
Testcases (all): 2937
Testcases (unique): 443


In [21]:
print("Dynamic")
print(f"Testcases (all): {len(test_data_1_dynamic)}")
print(f"Testcases (unique): {len(test_cases_1_dynamic)}")

Dynamic
Testcases (all): 8090
Testcases (unique): 499


### Testset 2

Testset 2 is human-generated, for exploring more interesting concepts like emerging technologies, brands and major events (e.g., disease outbreaks and financial crisis). For constructing the test word pairs, we first select emerging terms which have not been popularized before 1994, then query their well known precedents during 1990 to 1994 (e.g., app-2012 can correspond to software-1990).

In [22]:
test_data_2 = pd.read_csv(path_to_test_data / "testset_2.csv")

In [23]:
test_data_2.columns = ['truth', 'equivalent']

In [24]:
test_cases_2 = test_data_2['truth'].unique()

In [25]:
test_data_2 = test_data_2.sort_values(by='truth', ascending=True)

In [26]:
test_data_2.head()

Unnamed: 0,truth,equivalent
57,amazoncom-2000,walmart-1993
63,amazoncom-2000,macy-1994
62,amazoncom-2000,macy-1993
61,amazoncom-2000,macy-1992
60,amazoncom-2000,macy-1991


In [27]:
print("Testset 2")
print(f"Testcases (all): {len(test_data_2)}")
print(f"Testcases (unique): {len(test_cases_2)}")

Testset 2
Testcases (all): 444
Testcases (unique): 46


## Models

### TWEC

In [28]:
model_filenames_cade = glob(str(path_to_twec_model / "*.model"))

In [29]:
# load models
# The year is taken from the four characters in front of "_data". Parse the file
# name only, so a "_data" somewhere in a directory name cannot corrupt the key.
models_cade = {
    f"model_{Path(model_file).name.split('_data')[0][-4:]}": Word2Vec.load(model_file)
    for model_file in tqdm(model_filenames_cade)
}

  0%|          | 0/27 [00:00<?, ?it/s]

In [30]:
models_cade = {model_key: models_cade[model_key] for model_key in sorted(models_cade, key=lambda x: int(x.split('_')[1]))}

In [31]:
models_cade.keys()

dict_keys(['model_1990', 'model_1991', 'model_1992', 'model_1993', 'model_1994', 'model_1995', 'model_1996', 'model_1997', 'model_1998', 'model_1999', 'model_2000', 'model_2001', 'model_2002', 'model_2003', 'model_2004', 'model_2005', 'model_2006', 'model_2007', 'model_2008', 'model_2009', 'model_2010', 'model_2011', 'model_2012', 'model_2013', 'model_2014', 'model_2015', 'model_2016'])

Create dictionary of testsets that contain all test-words along with their embedding in the respective year

In [32]:
test_case_dict_cade_all = create_test_case_dict_cade(test_cases_1_all, models_cade)

2 Testcases are not in the vocab of the model(s)


In [33]:
test_case_dict_cade_static = create_test_case_dict_cade(test_cases_1_static, models_cade)

2 Testcases are not in the vocab of the model(s)


In [34]:
test_case_dict_cade_dynamic = create_test_case_dict_cade(test_cases_1_dynamic, models_cade)

2 Testcases are not in the vocab of the model(s)


Retrieve most similar words for each testword in each year

In [35]:
similarities_cade_all = get_similarities_of_models(models_cade, test_case_dict_cade_all)

  0%|          | 0/497 [00:00<?, ?it/s]

In [36]:
similarities_cade_static = get_similarities_of_models(models_cade, test_case_dict_cade_static)

  0%|          | 0/441 [00:00<?, ?it/s]

In [37]:
similarities_cade_dynamic = get_similarities_of_models(models_cade, test_case_dict_cade_dynamic)

  0%|          | 0/497 [00:00<?, ?it/s]

### Static Word2Vec

In [38]:
model_static = Word2Vec.load(str(path_to_static_model / "w2v_model.model"))

In [39]:
test_case_dict_static_all = create_test_case_dict_static(model_static, test_cases_1_all)

0 Testcases are not in the vocab of the model


In [40]:
test_case_dict_static_static = create_test_case_dict_static(model_static, test_cases_1_static)

0 Testcases are not in the vocab of the model


In [41]:
test_case_dict_static_dynamic = create_test_case_dict_static(model_static, test_cases_1_dynamic)

0 Testcases are not in the vocab of the model


In [42]:
test_case_dict_static_all

{'49ers-1990': array([-1.3509833 ,  1.5813793 ,  1.5203751 , -0.26775545,  0.653239  ,
         0.66569066, -0.3561276 ,  2.351575  , -2.0535777 ,  0.07361615,
         0.8612238 ,  1.2791396 , -1.7889788 ,  3.1717565 , -1.0927444 ,
        -0.11742842, -0.5277771 , -0.22165386,  1.1616123 , -2.0924761 ,
         0.09479195, -3.0151434 , -0.0677463 , -2.0772352 , -1.0678693 ,
        -0.24881327,  0.21123794,  0.2791202 , -0.6145486 , -1.746779  ,
        -1.3856107 , -0.7327089 , -3.6868267 , -0.5906266 ,  1.6431425 ,
        -0.6785532 , -0.98610675,  0.9487777 ,  1.6457883 , -0.15079555,
        -0.5570679 ,  0.12505175,  1.6398933 , -1.1718907 ,  0.9152497 ,
         0.1758884 ,  0.9599068 , -0.91914797, -1.8983663 , -0.75175   ,
        -0.83106846,  0.28722718, -2.5703707 ,  0.7375135 , -0.47523916,
         2.781508  , -0.49144587, -1.0849645 ,  1.1245705 , -0.06963482,
         1.0085394 ,  0.09480161, -0.40199113, -1.557133  ,  2.008354  ,
        -0.48916528, -0.70392615, -0.

In [43]:
similarities_static_all = get_similarities_of_models_static(model_static, test_case_dict_static_all)

  0%|          | 0/499 [00:00<?, ?it/s]

In [44]:
similarities_static_static = get_similarities_of_models_static(model_static, test_case_dict_static_static)

  0%|          | 0/443 [00:00<?, ?it/s]

In [45]:
similarities_static_dynamic = get_similarities_of_models_static(model_static, test_case_dict_static_dynamic)

  0%|          | 0/499 [00:00<?, ?it/s]

### TPPMI

In [47]:
ppmi_data_files = sorted(glob(str(path_to_tppmi_model  / "*.npz")))
words_files = sorted(glob(str(path_to_tppmi_model  / "*.pkl")))

Split context-words from timestamped-vocabularies

In [48]:
context_words_file = [path for path in words_files if "context-words" in path]
ppmi_vocab_files = [path for path in words_files if "context-words" not in path]

In [49]:
# Get ppmi-matrices and vocab
# Both file lists are sorted and paired by position; a different number of files
# would make zip() silently drop or mis-pair years, so fail loudly instead.
if len(ppmi_vocab_files) != len(ppmi_data_files):
    raise ValueError(
        f"Found {len(ppmi_vocab_files)} vocab files but {len(ppmi_data_files)} ppmi matrices"
    )

ppmi_matrices = {}

for vocab_file, matrix_file in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(matrix_file)
    # NOTE: pickle.load can execute arbitrary code; only load files created by this project
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    # The year follows "ppmi-" in the file name; parse the name only, so the key
    # does not depend on how often "ppmi-" occurs in the directory path
    key = Path(vocab_file).name.split("ppmi-")[1][0:4]
    ppmi_matrices[key] = {"ppmi_matrix" : ppmi_matrix, "vocab": vocab}

# Get common context-words
with open(context_words_file[0], "rb") as f:
    context_words = pickle.load(f)

In [50]:
ppmi_matrices.keys()

dict_keys(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'])

Create ppmi_model objects

In [51]:
ppmi_models = {key: PPMIModel.construct_from_data(ppmi_data["ppmi_matrix"], ppmi_data["vocab"], context_words) for key, ppmi_data in ppmi_matrices.items()}

In [52]:
tppmi_model = TPPMIModel(ppmi_models, dates="years")

In [55]:
# All ground-truth pairs for the query "bush-1990", ordered by the year of the equivalent.
# NOTE(review): nothing visible in this notebook reads `test_list` afterwards — confirm it is still needed.
test_list = (
    test_data_1_all[test_data_1_all.truth == "bush-1990"]
    # Temporary integer column, used for sorting only
    .assign(year=lambda frame: frame['equivalent'].apply(lambda x: int(x.split('-')[1])))
    .sort_values(by='year')
    .drop('year', axis=1)
)
# print(test_list)

# Experiment

To examine the quality of embedding alignment, we create a task to query equivalences across years.

For example, given obama-2012, we want to query its equivalent word in 2002. As we know obama is the U.S. president in 2012; its equivalent in 2002 is bush, who was the U.S. president at that time. In this way, we create two testsets.

All results are rounded to three decimal places.

In [77]:
# Cutoffs K used for the precision scores
cutoffs = [1, 3, 5, 10]
# Evaluation splits of testset 1
list_of_types = ["all", "static", "dynamic"]

# One [similarities, ground truth] pair per split, in the order of list_of_types
list_of_data_cade = [
    [similarities_cade_all, test_data_1_all],
    [similarities_cade_static, test_data_1_static],
    [similarities_cade_dynamic, test_data_1_dynamic],
]
list_of_data_static = [
    [similarities_static_all, test_data_1_all],
    [similarities_static_static, test_data_1_static],
    [similarities_static_dynamic, test_data_1_dynamic],
]

config_dict_cade = dict(zip(list_of_types, list_of_data_cade))
config_dict_static = dict(zip(list_of_types, list_of_data_static))

# Score containers, filled by the metric cells below: {split: {score name: value}}
scores_cade = {evaluation_type: {} for evaluation_type in list_of_types}
scores_static = {evaluation_type: {} for evaluation_type in list_of_types}

## Mean Reciprocal Rank (@10)

The Mean Reciprocal Rank (MRR) is a statistical measure used to evaluate the performance of a system that returns a ranked list of responses to queries. It is the average of the reciprocal ranks of the first correct answer for each query, where the reciprocal rank is the inverse of the rank at which the first relevant answer is found.
It is evaluated @10: only the top 10 retrieved neighbours are considered, so a target word that is not among them contributes a reciprocal rank of 0.

### TWEC

In [94]:
# Score every TWEC split with the mean reciprocal rank, rounded to three decimals.
# value[0] holds the similarities and value[1] the ground truth of the split.
# NOTE(review): the "@10" in the score name presumably reflects the length of the
# retrieved neighbour lists; no cutoff is applied here — confirm.
for key, value in tqdm(config_dict_cade.items()):
    scores_cade[key]["mrr@10"] = round(calculate_rank_metric(value[0], value[1], metric='MRR'), 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### Static Word2Vec

In [95]:
# Score every static-baseline split with the mean reciprocal rank, rounded to three decimals.
# value[0] holds the similarities and value[1] the ground truth of the split.
# NOTE(review): the "@10" in the score name presumably reflects the length of the
# retrieved neighbour lists; no cutoff is applied here — confirm.
for key, value in tqdm(config_dict_static.items()):
    scores_static[key]["mrr@10"] = round(calculate_rank_metric_static(value[0], value[1], metric='MRR'), 3)

  0%|          | 0/3 [00:00<?, ?it/s]

## Mean Precision (@K)

As introduced by Yao et al. (2018), the MP@K is defined as follows: consider the K words most similar to the query embedding for the given year. The Precision@K for a particular test i, denoted P@K[i], equals 1 if the target word appears within this set of K words; otherwise, it is 0. MP@K is the mean of P@K[i] over all tests.

### TWEC

In [96]:
# Score every TWEC split with the mean precision at each cutoff, rounded to three decimals.
# Any metric other than 'MRR' selects precision at k in calculate_mean_rank.
for key, value in tqdm(config_dict_cade.items()):
    for k in cutoffs:
        score_key = f"mp@{k}"
        scores_cade[key][score_key] = round(calculate_rank_metric(value[0], value[1], metric='MP', k=k), 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### Static Word2Vec

In [97]:
# Score every static-baseline split with the mean precision at each cutoff, rounded to three decimals.
# Any metric other than 'MRR' selects precision at k in calculate_rank_metric_static.
for key, value in tqdm(config_dict_static.items()):
    for k in cutoffs:
        score_key = f"mp@{k}"
        scores_static[key][score_key] = round(calculate_rank_metric_static(value[0], value[1], metric='MP', k=k), 3)

  0%|          | 0/3 [00:00<?, ?it/s]

## Results

In [98]:
score_table_cade = pd.DataFrame(scores_cade).T
score_table_static = pd.DataFrame(scores_static).T
print("Scores of TWEC")
display(score_table_cade)
print("Scores of Static Word2Vec (Baseline)")
display(score_table_static)

Scores of TWEC


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
all,0.334,0.254,0.387,0.443,0.505
static,0.508,0.42,0.569,0.629,0.69
dynamic,0.291,0.213,0.343,0.4,0.462


Scores of Static Word2Vec (Baseline)


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
all,0.374,0.266,0.442,0.539,0.606
static,1.0,1.0,1.0,1.0,1.0
dynamic,0.147,0.0,0.24,0.372,0.463


----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------

# Scrapyard

In [None]:
def calculate_mean_reciprocal_rank_test(test_key: str, testcase: dict, test_data: pd.DataFrame):
    """
    Verbose debugging variant of the mean reciprocal rank for one test case.

    Prints every intermediate value (selected ground-truth rows, neighbour words,
    target word and reciprocal rank per year) while computing the score.

    Parameters:
    test_key (str): The query; selects the rows of test_data whose "truth" equals it.
    testcase (dict): Maps a year to the neighbours retrieved for that year, each
        neighbour being a sequence whose first element is the word.
    test_data (pd.DataFrame): Ground truth with the columns "truth" and "equivalent".

    Returns:
    Mean reciprocal rank over the years that have ground truth (only the first
    ground-truth row of a year is used), or 0 if there is none.
    """
    print(test_key)
    rows_for_key = test_data[test_data["truth"] == test_key]
    print("test_data_for_key")
    print(f"length: {len(rows_for_key)}")
    print(rows_for_key)

    reciprocal_ranks = []
    for year, neighbours in testcase.items():
        print(f"Key: {year}")
        rows_for_year = rows_for_key[rows_for_key["equivalent"].str.endswith(year)]
        neighbour_words = [entry[0] for entry in neighbours]
        print("WORD-LIST")
        print(neighbour_words)
        if not len(rows_for_year):
            # No ground truth for this year, so there is nothing to rank
            print("CONTINUE")
            continue
        print("test_data_for_year")
        print(rows_for_year)
        target_word = rows_for_year["equivalent"].iloc[0].split("-")[0]
        print(target_word)
        score = calculate_reciprocal_rank(neighbour_words, target_word)
        print(score)
        reciprocal_ranks.append(score)

    # Guard against dividing by zero when no year had ground truth
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

In [None]:
def calculate_mrr_for_key(data_dict, key):
    """
    Compute how well a word retrieves itself across all years (Mean Reciprocal Rank).

    Parameters:
    data_dict (dict): Maps a 'word-year' key to a dictionary that maps years to
        ranked lists of (word, score) pairs.
    key (str): Entry of data_dict to evaluate. The part before the first hyphen
        is taken as the word to look for.

    Returns:
    float: Sum of the reciprocal ranks of the word's first occurrence per year
           (0 for years where it does not occur), divided by the number of
           years; 0.0 if the entry has no years.
    """
    target = key.split('-')[0]
    neighbours_by_year = data_dict[key]

    reciprocal_rank_sum = 0
    for ranked_pairs in neighbours_by_year.values():
        # Rank of the first occurrence of the target, or None if it is absent
        first_hit = next(
            (position for position, (word, _score) in enumerate(ranked_pairs, start=1) if word == target),
            None,
        )
        if first_hit is not None:
            reciprocal_rank_sum += 1.0 / first_hit

    year_count = len(neighbours_by_year)
    if year_count <= 0:
        return 0.0  # no years, nothing to average
    return reciprocal_rank_sum / year_count

In [None]:
def get_similarities_of_model_manual(model, test_word, top_n = 10) -> list:
    """
    Rank the words of a model by cosine similarity to a query vector, by hand.

    Parameters:
    model: Model object whose `.wv` exposes `.vocab` and item access by word.
    test_word: Sequence whose second element (index 1) is the query vector.
    top_n (int): Number of best matches to return.

    Returns:
    list: The top_n (word, similarity) pairs, most similar first. The same list
          is also printed.
    """
    query_vector = test_word[1]

    # Cosine similarity is one minus the cosine distance
    similarity_by_word = {
        candidate: 1 - spatial.distance.cosine(query_vector, model.wv[candidate])
        for candidate in model.wv.vocab
    }

    # Most similar first
    ranking = list(similarity_by_word.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    best_matches = ranking[:top_n]

    print("Sorted Similarities")
    print(best_matches)

    return best_matches