In [20]:
import json
import pickle
import pandas as pd
from glob import glob
from pathlib import Path
from scipy import spatial
import scipy.sparse as sp
from itertools import islice
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel

# Imports

In [190]:
path_to_test_data = Path("../../data") / "test"
path_to_tppmi_model = Path("../../data") / "ppmi-matrices" / "nyt-data"
path_to_twec_model = Path("../../model") / "nyt-data" / "cade" / "model"
path_to_static_model = Path("../../model") / "nyt-data" / "static"

# Functions

(to be outsourced later)

## Similarity Calculations

In [246]:
def get_similarities_of_models(model_dict: dict, test_word_dict: dict) -> dict:
    """
    Query every model for the nearest neighbours of every test-word vector.

    Parameters:
    model_dict (dict): Maps a model key to a model object exposing `wv.similar_by_vector`.
        The second underscore-separated piece of the key (e.g. "1990" for "model_1990")
        becomes the inner key of the result.
    test_word_dict (dict): Maps a test-case name to the vector that is used as the query.

    Returns:
    dict: `{test_case_name: {model_key_piece: model.wv.similar_by_vector(vector)}}`.
    """
    similarities = dict()
    for test_case, query_vector in tqdm(test_word_dict.items()):
        per_model = similarities[test_case] = dict()
        for model_key, model in model_dict.items():
            model_label = model_key.split("_")[1]
            per_model[model_label] = model.wv.similar_by_vector(query_vector)
    return similarities

In [249]:
# NOTE(review): `cade_models` and `test_case_dict_1` are only created further down (section
# "Models" / "TWEC"), so this cell fails on a fresh kernel when the notebook is run top to bottom.
similarities_test = get_similarities_of_models(cade_models, test_case_dict_1)

  0%|          | 0/497 [00:00<?, ?it/s]

## Evaluation Metrics

In [250]:
def calculate_reciprocal_rank(test_list: list, test_word: str) -> float:
    """
    Return the reciprocal of the 1-based position of a word inside a list.

    Parameters:
    test_list (list of str): The candidates to search; the position in the list is the rank.
    test_word (str): The word whose first occurrence is looked up.

    Returns:
    float: 1 / rank of the first occurrence of test_word, or 0.0 if it does not occur.
    """
    if test_word not in test_list:
        return 0.0  # an absent word contributes nothing

    position = test_list.index(test_word)  # 0-based position of the first hit
    return 1.0 / (position + 1)

In [251]:
def calculate_precision_at_k(test_list: list, test_word: str, k: int) -> int:
    """
    Tell whether a word occurs among the first K entries of a list.

    Parameters:
    test_list (list of str): The candidates to search; only the first k entries are inspected.
    test_word (str): The word to look for.
    k (int): How many leading entries of test_list to consider. Must be positive.

    Returns:
    int: 1 if test_word is among the first k entries of test_list, otherwise 0.

    Raises:
    ValueError: If k is zero or negative.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    return 1 if test_word in test_list[:k] else 0

In [252]:
def calculate_mean_rank(test_key: str, testcase: dict, test_data: pd.DataFrame, metric = "MRR", k = 10)-> float:
    """
    Average a ranking score over all years of a single test case.

    Parameters:
    test_key (str): Value of the "truth" column that selects the rows of this test case.
    testcase (dict): Maps a year string to a sequence of items whose first element is a word.
    test_data (pd.DataFrame): Frame with the columns "truth" and "equivalent".
    metric (str): "MRR" scores with calculate_reciprocal_rank; any other value scores
        with calculate_precision_at_k.
    k (int): Cut-off handed to calculate_precision_at_k; not used for "MRR".

    Returns:
    float: Mean of the per-year scores, or 0 if no year could be scored.
    """
    rows_for_key = test_data[test_data["truth"] == test_key]
    scores = []

    for year, neighbours in testcase.items():
        rows_for_year = rows_for_key[rows_for_key["equivalent"].str.endswith(year)]
        candidates = [pair[0] for pair in neighbours]

        if not len(rows_for_year):
            continue  # nothing to rank against for this year

        # Only the first matching row is used; the word is the part before the first "-".
        expected_word = rows_for_year["equivalent"].iloc[0].split("-")[0]
        score = (
            calculate_reciprocal_rank(candidates, expected_word)
            if metric == "MRR"
            else calculate_precision_at_k(candidates, expected_word, k)
        )
        scores.append(score)

    # The empty case is handled separately to avoid a division by zero.
    return sum(scores) / len(scores) if scores else 0

In [253]:
def calculate_rank_metric(similarities: dict, test_data: pd.DataFrame, metric = "MRR", k = 10) -> float:
    """
    Average the per-test-case score returned by calculate_mean_rank over all test cases.

    Parameters:
    similarities (dict): Maps a test-case key to the per-year data handed to calculate_mean_rank.
    test_data (pd.DataFrame): Passed through to calculate_mean_rank unchanged.
    metric (str): Passed through to calculate_mean_rank.
    k (int): Passed through to calculate_mean_rank.

    Returns:
    float: Mean of the per-test-case scores, or 0 if `similarities` is empty.
    """
    per_case_scores = [
        calculate_mean_rank(test_key, per_year, test_data, metric, k)
        for test_key, per_year in similarities.items()
    ]

    if not per_case_scores:
        return 0  # avoids dividing by zero
    return sum(per_case_scores) / len(per_case_scores)

# Setup

## Testsets

### Testset 1

Testset 1 is based on publicly recorded knowledge that lists, for each year, the name associated with a particular role, such as U.S. president, U.K. prime minister, NFL Super Bowl champion team, and so on.

In [254]:
test_data_1 = pd.read_csv(path_to_test_data / "testset_1.csv")

In [255]:
test_data_1.columns = ['truth', 'equivalent']

In [256]:
test_cases_1 = test_data_1['truth'].unique()

In [257]:
test_data_1 = test_data_1.sort_values(by='truth', ascending=True)

In [258]:
print("Testset 1")
print(f"Testcases (all): {len(test_data_1)}")
print(f"Testcases (unique): {len(test_cases_1)}")

Testset 1
Testcases (all): 11027
Testcases (unique): 499


Now we want to split the testset into static & dynamic testcases as was done by Di Carlo et al. in their paper "Training Temporal Word Embeddings with a Compass"

In [259]:
# Splitting the columns into words and years
split_truth = test_data_1['truth'].str.split('-', expand=True)
split_equivalent = test_data_1['equivalent'].str.split('-', expand=True)

# Creating masks for "static" and "dynamic" conditions
static_mask = split_truth[0] == split_equivalent[0]
dynamic_mask = split_truth[0] != split_equivalent[0]

# Applying the masks to create the separate DataFrames
static_df = test_data_1[static_mask]
dynamic_df = test_data_1[dynamic_mask]

In [260]:
static_test_cases = static_df['truth'].unique()
dynamic_test_cases = dynamic_df['truth'].unique()

In [261]:
print("Static")
print(f"Testcases (all): {len(static_df)}")
print(f"Testcases (unique): {len(static_test_cases)}")

Static
Testcases (all): 2937
Testcases (unique): 443


In [262]:
print("Dynamic")
print(f"Testcases (all): {len(dynamic_df)}")
print(f"Testcases (unique): {len(dynamic_test_cases)}")

Dynamic
Testcases (all): 8090
Testcases (unique): 499


### Testset 2

Testset 2 is human-generated, for exploring more interesting concepts like emerging technologies, brands and major events (e.g., disease outbreaks and financial crisis). For constructing the test word pairs, we first select emerging terms which have not been popularized before 1994, then query their well known precedents during 1990 to 1994 (e.g., app-2012 can correspond to software-1990).

In [263]:
test_data_2 = pd.read_csv(path_to_test_data / "testset_2.csv")

In [264]:
test_data_2.columns = ['truth', 'equivalent']

In [265]:
test_cases_2 = test_data_2['truth'].unique()

In [266]:
test_data_2 = test_data_2.sort_values(by='truth', ascending=True)

In [267]:
test_data_2.head()

Unnamed: 0,truth,equivalent
57,amazoncom-2000,walmart-1993
63,amazoncom-2000,macy-1994
62,amazoncom-2000,macy-1993
61,amazoncom-2000,macy-1992
60,amazoncom-2000,macy-1991


In [268]:
print("Testset 2")
print(f"Testcases (all): {len(test_data_2)}")
print(f"Testcases (unique): {len(test_cases_2)}")

Testset 2
Testcases (all): 444
Testcases (unique): 46


## Models

### TWEC

In [269]:
cade_model_filenames = glob(str(path_to_twec_model / "*.model"))

In [270]:
# load models
# Each key is "model_" plus the four characters that precede the first "_data" in the file path
# (the year, judging by the keys displayed further down).
cade_models = {f"model_{model_file.split('_data')[0][-4:]}":Word2Vec.load(model_file) for model_file in tqdm(cade_model_filenames)}

  0%|          | 0/27 [00:00<?, ?it/s]

In [271]:
cade_models = {model_key: cade_models[model_key] for model_key in sorted(cade_models, key=lambda x: int(x.split('_')[1]))}

In [272]:
cade_models.keys()

dict_keys(['model_1990', 'model_1991', 'model_1992', 'model_1993', 'model_1994', 'model_1995', 'model_1996', 'model_1997', 'model_1998', 'model_1999', 'model_2000', 'model_2001', 'model_2002', 'model_2003', 'model_2004', 'model_2005', 'model_2006', 'model_2007', 'model_2008', 'model_2009', 'model_2010', 'model_2011', 'model_2012', 'model_2013', 'model_2014', 'model_2015', 'model_2016'])

#### Testset 1

In [273]:
test_case_dict_1 = dict()
counter = 0

In [274]:
len(test_cases_1)

499

In [275]:
# Re-initialise the accumulators in this cell so that re-running it starts from a clean state
# instead of adding to `counter` from a previous run.
test_case_dict_1 = dict()
counter = 0

for test_case in test_cases_1:
    # Test cases have the form "<word>-<year>"; the year selects the model that is queried.
    word, year = test_case.split("-")
    ground_model = cade_models[f"model_{year}"]
    if word in ground_model.wv.vocab:
        test_case_dict_1[test_case] = ground_model.wv.get_vector(word)
    else:
        # Words missing from the model's vocabulary are only counted, not stored.
        counter += 1
print(f"{counter} Testcases are not in the vocab of the model(s)")

2 Testcases are not in the vocab of the model(s)


In [276]:
cade_model = cade_models[next(iter(cade_models))]

In [277]:
'''
# Takes long to execute, load from memory in next-cell

cade_similarities = get_similarities_of_models(cade_models, test_case_dict_1)
with open(path_to_test_data / 'cade_t1.json', 'w') as f:
    json.dump( cade_similarities, f, indent=4)''';

In [278]:
with open(path_to_test_data / 'cade_t1.json', 'r') as json_file:
    cade_similarities = json.load(json_file)

### Static Word2Vec

In [279]:
from gensim.models import Word2Vec
static_model = Word2Vec.load(str(path_to_static_model / "w2v_model.model"))

### TPPMI

In [280]:
ppmi_data_files = sorted(glob(str(path_to_tppmi_model  / "*.npz")))
words_files = sorted(glob(str(path_to_tppmi_model  / "*.pkl")))

Split context-words from timestamped-vocabularies

In [281]:
context_words_file = [path for path in words_files if "context-words" in path]
ppmi_vocab_files = [path for path in words_files if "context-words" not in path]

In [282]:
# Get ppmi-matrices and vocab
ppmi_matrices = {}

# The vocab and matrix file lists are paired purely by position.
# NOTE(review): assumes both sorted lists line up year by year — TODO confirm.
for filenames in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(filenames[1])
    with open(filenames[0], "rb") as f:
        # NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
        vocab = pickle.load(f)
    # The key is the four characters following the second "ppmi-" in the vocab path
    # (the year, judging by the keys displayed below).
    key = filenames[0].split("ppmi-")[2][0:4]
    ppmi_matrices[key] = {"ppmi_matrix" : ppmi_matrix, "vocab": vocab}

# Get common context-words
with open(context_words_file[0], "rb") as f:
    context_words = pickle.load(f)

In [283]:
ppmi_matrices.keys()

dict_keys(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'])

Create ppmi_model objects

In [284]:
ppmi_models = {key: PPMIModel.construct_from_data(ppmi_data["ppmi_matrix"], ppmi_data["vocab"], context_words) for key, ppmi_data in ppmi_matrices.items()}

In [285]:
tppmi_model = TPPMIModel(ppmi_models, dates="years")

In [286]:
test_list = test_data_1[test_data_1.truth == "bush-1990"].copy()
test_list['year'] = test_list['equivalent'].apply(lambda x: int(x.split('-')[1]))  # Extract year and convert to int
test_list = test_list.sort_values(by='year')  # Sort by the new 'year' column
test_list = test_list.drop('year', axis=1)
print(test_list)

        truth    equivalent
0   bush-1990  clinton-1992
1   bush-1990  clinton-1993
2   bush-1990  clinton-1994
3   bush-1990  clinton-1995
4   bush-1990  clinton-1996
5   bush-1990  clinton-1997
6   bush-1990  clinton-1998
7   bush-1990  clinton-1999
8   bush-1990     bush-2000
9   bush-1990     bush-2001
10  bush-1990     bush-2002
11  bush-1990     bush-2003
12  bush-1990     bush-2004
13  bush-1990     bush-2005
14  bush-1990     bush-2006
15  bush-1990     bush-2007
16  bush-1990    obama-2008
17  bush-1990    obama-2009
18  bush-1990    obama-2010
19  bush-1990    obama-2011
20  bush-1990    obama-2012
21  bush-1990    obama-2013
22  bush-1990    obama-2014
23  bush-1990    obama-2015
24  bush-1990    obama-2016


# Experiment

To examine the quality of embedding alignment, we create a task to query equivalences across years.

For example, given obama-2012, we want to query its equivalent word in 2002. As we know obama is the U.S. president in 2012; its equivalent in 2002 is bush, who was the U.S. president at that time. In this way, we create two testsets.

All results are rounded to three decimal places.

In [287]:
cade_scores = dict()

## Mean Reciprocal Rank (@10)

The Mean Reciprocal Rank (MRR) is a statistical measure used to evaluate the performance of a system that returns a ranked list of responses to queries. It is the average of the reciprocal ranks of the first correct answer for each query, where the reciprocal rank is the inverse of the rank at which the first relevant answer is found.
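Here it is evaluated at a cutoff of 10 (MRR@10): only the 10 most similar words returned for each query are considered.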

In [290]:
# NOTE(review): this replaces the `cade_similarities` that was loaded from cade_t1.json earlier
# in the notebook with `similarities_test` from the "Similarity Calculations" section.
cade_similarities = similarities_test

In [291]:
# k has no effect when metric='MRR'; 10 is passed so that the call matches the "mrr@10" label.
mrr_at_10 = round(calculate_rank_metric(cade_similarities, test_data_1, metric='MRR', k=10), 3)
cade_scores["mrr@10"] = mrr_at_10
print(f"MRR of the Cade Model on Testset1: {mrr_at_10}")

MRR of the Cade Model on Testset1: 0.334


## Mean Precision (@K)

As introduced by Yao et al. (2018), the MP@K is defined as follows: consider the K words most similar to the query embedding for the given year. The Precision@K for a particular test i, denoted P@K[i], equals 1 if the target word appears within this set of K words; otherwise, it is 0.

In [292]:
mp_at_1 = round(calculate_rank_metric(cade_similarities, test_data_1, metric='MP', k=1), 3)
cade_scores["mp@1"] = mp_at_1
print(f"MP@1 of the Cade Model on Testset1: {mp_at_1}")

MP@1 of the Cade Model on Testset1: 0.254


In [293]:
mp_at_3 = round(calculate_rank_metric(cade_similarities, test_data_1, metric='MP', k=3), 3)
cade_scores["mp@3"] = mp_at_3
print(f"MP@3 of the Cade Model on Testset1: {mp_at_3}")

MP@3 of the Cade Model on Testset1: 0.387


In [294]:
mp_at_5 = round(calculate_rank_metric(cade_similarities, test_data_1, metric='MP', k=5), 3)
cade_scores["mp@5"] = mp_at_5
print(f"MP@5 of the Cade Model on Testset1: {mp_at_5}")

MP@5 of the Cade Model on Testset1: 0.443


In [295]:
# Mean precision over the 10 most similar words.
mp_at_10 = round(calculate_rank_metric(cade_similarities, test_data_1, metric='MP', k=10), 3)
cade_scores["mp@10"] = mp_at_10
print(f"MP@10 of the Cade Model on Testset1: {mp_at_10}")

MP@5 of the Cade Model on Testset1: 0.505


In [296]:
print(json.dumps(cade_scores, indent=4))

{
    "mrr@10": 0.334,
    "mp@1": 0.254,
    "mp@3": 0.387,
    "mp@5": 0.443,
    "mp@10": 0.505
}


# Scrapyard

In [201]:
def calculate_mean_reciprocal_rank_test(test_key: str, testcase: dict, test_data: pd.DataFrame):
    """
    Verbose debugging variant of the mean-reciprocal-rank computation for one test case.

    Prints every intermediate value (selected rows, candidate word list, target word and
    reciprocal rank) while averaging calculate_reciprocal_rank over the years in `testcase`.

    Parameters:
    test_key (str): Value of the "truth" column that selects the rows of this test case.
    testcase (dict): Maps a year string to a sequence of items whose first element is a word.
    test_data (pd.DataFrame): Frame with the columns "truth" and "equivalent".

    Returns:
    The mean of the per-year reciprocal ranks, or 0 if no year could be scored.
    """
    print(test_key)
    rows_for_key = test_data[test_data["truth"] == test_key]
    print("test_data_for_key")
    print(f"length: {len(rows_for_key)}")
    print(rows_for_key)

    reciprocal_ranks = []
    for year, neighbours in testcase.items():
        print(f"Key: {year}")
        rows_for_year = rows_for_key[rows_for_key["equivalent"].str.endswith(year)]
        candidates = [pair[0] for pair in neighbours]
        print("WORD-LIST")
        print(candidates)

        if not len(rows_for_year):
            # No row to compare against for this year, so there is nothing to score.
            print("CONTINUE")
            continue

        print("test_data_for_year")
        print(rows_for_year)
        # Only the first matching row is used; the word is the part before the first "-".
        expected_word = rows_for_year["equivalent"].iloc[0].split("-")[0]
        print(expected_word)
        score = calculate_reciprocal_rank(candidates, expected_word)
        print(score)
        reciprocal_ranks.append(score)

    # The empty case is handled separately to avoid a division by zero.
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

In [141]:
def calculate_mrr_for_key(data_dict, key):
    """
    Compute the Mean Reciprocal Rank (MRR) of a key's own word across all of its years.

    Parameters:
    data_dict (dict): Maps a key of the form 'word-year' to a dict that maps each year to a
        ranked sequence of (word, score) pairs.
    key (str): The entry of data_dict to evaluate; the text before the first hyphen is the
        word that is searched for in every year's ranking.

    Returns:
    float: The average over all years of 1 / rank of the first occurrence of the word
        (a year without the word counts as 0), or 0.0 if the entry has no years.
    """
    test_word = key.split('-')[0]

    reciprocal_ranks = [
        # First matching rank only; 0 when the word is absent from that year's ranking.
        next(
            (1.0 / rank
             for rank, (word, _score) in enumerate(word_score_pairs, start=1)
             if word == test_word),
            0,
        )
        for _year, word_score_pairs in data_dict[key].items()
    ]

    if not reciprocal_ranks:
        return 0.0  # no years/data to average over
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

In [None]:
def get_similarities_of_model_manual(model, test_word, top_n = 10) -> list:
    """
    Rank every vocabulary word of a model by cosine similarity to a query embedding.

    Parameters:
    model: Object whose `wv.vocab` iterates over words and whose `wv[word]` yields the embedding.
    test_word: Indexable whose element at position 1 is the query embedding.
    top_n (int): Number of best-ranked entries to print and return.

    Returns:
    list: The top_n (word, similarity) pairs, most similar first.
    """
    query_embedding = test_word[1]

    # Similarity is 1 minus the cosine distance to every word's embedding.
    word_similarities = {
        candidate: 1 - spatial.distance.cosine(query_embedding, model.wv[candidate])
        for candidate in model.wv.vocab
    }

    # Highest similarity first, then keep only the leading top_n entries.
    ranked = list(word_similarities.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    best_matches = ranked[:top_n]

    print("Sorted Similarities")
    print(best_matches)

    return best_matches