In [197]:
import json
import pickle
import pandas as pd
from glob import glob
from pathlib import Path
from scipy import spatial
import scipy.sparse as sp
from itertools import islice
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel

# Imports

In [82]:
path_to_test_data = Path("../../data") / "test"
path_to_tppmi_model = Path("../../data") / "ppmi-matrices" / "nyt-data"
path_to_twec_model = Path("../../model") / "nyt-data" / "cade" / "model"

## Testsets

### Testset 1

Testset 1 is based on publicly recorded knowledge: for each year, it lists the name associated with a particular role, such as U.S. president, U.K. prime minister, or NFL Super Bowl champion team.

In [83]:
test_data_1 = pd.read_csv(path_to_test_data / "testset_1.csv")

In [84]:
test_data_1.columns = ['truth', 'equivalent']

In [85]:
test_cases_1 = test_data_1['truth'].unique()

In [86]:
test_data_1 = test_data_1.sort_values(by='truth', ascending=True)

In [87]:
print(f"Testset1 contains {len(test_cases_1)} test cases")

Testset1 contains 499 test cases


### Testset 2

Testset 2 is human-generated and explores more interesting concepts such as emerging technologies, brands, and major events (e.g., disease outbreaks and financial crises). To construct the test word pairs, we first select emerging terms that had not been popularized before 1994, then look up their well-known precedents from 1990 to 1994 (e.g., app-2012 can correspond to software-1990).

In [88]:
test_data_2 = pd.read_csv(path_to_test_data / "testset_2.csv")

In [89]:
test_data_2.columns = ['truth', 'equivalent']

In [90]:
test_data_2 = test_data_2.sort_values(by='truth', ascending=True)

In [91]:
test_data_2.head()

Unnamed: 0,truth,equivalent
57,amazoncom-2000,walmart-1993
63,amazoncom-2000,macy-1994
62,amazoncom-2000,macy-1993
61,amazoncom-2000,macy-1992
60,amazoncom-2000,macy-1991


## Models

### TWEC

In [92]:
cade_model_filenames = glob(str(path_to_twec_model / "*.model"))

In [93]:
path_to_twec_model

PosixPath('../../model/nyt-data/cade/model')

In [94]:
cade_model_filenames

['../../model/nyt-data/cade/model/1994_data.model',
 '../../model/nyt-data/cade/model/2000_data.model',
 '../../model/nyt-data/cade/model/1999_data.model',
 '../../model/nyt-data/cade/model/2011_data.model',
 '../../model/nyt-data/cade/model/2008_data.model',
 '../../model/nyt-data/cade/model/2014_data.model',
 '../../model/nyt-data/cade/model/2005_data.model',
 '../../model/nyt-data/cade/model/1991_data.model',
 '../../model/nyt-data/cade/model/1996_data.model',
 '../../model/nyt-data/cade/model/2002_data.model',
 '../../model/nyt-data/cade/model/2013_data.model',
 '../../model/nyt-data/cade/model/2016_data.model',
 '../../model/nyt-data/cade/model/2007_data.model',
 '../../model/nyt-data/cade/model/1993_data.model',
 '../../model/nyt-data/cade/model/2009_data.model',
 '../../model/nyt-data/cade/model/2015_data.model',
 '../../model/nyt-data/cade/model/1990_data.model',
 '../../model/nyt-data/cade/model/2004_data.model',
 '../../model/nyt-data/cade/model/2001_data.model',
 '../../mode

In [95]:
# load models
cade_models = {f"model_{model_file.split('_data')[0][-4:]}":Word2Vec.load(model_file) for model_file in tqdm(cade_model_filenames)}

  0%|          | 0/27 [00:00<?, ?it/s]

In [96]:
cade_models = {model_key: cade_models[model_key] for model_key in sorted(cade_models, key=lambda x: int(x.split('_')[1]))}

In [97]:
cade_models.keys()

dict_keys(['model_1990', 'model_1991', 'model_1992', 'model_1993', 'model_1994', 'model_1995', 'model_1996', 'model_1997', 'model_1998', 'model_1999', 'model_2000', 'model_2001', 'model_2002', 'model_2003', 'model_2004', 'model_2005', 'model_2006', 'model_2007', 'model_2008', 'model_2009', 'model_2010', 'model_2011', 'model_2012', 'model_2013', 'model_2014', 'model_2015', 'model_2016'])

#### Testset 1

In [98]:
test_case_dict_1 = dict()
counter = 0

In [99]:
len(test_cases_1)

499

In [100]:
# Look up the embedding of every test word in the model of its own year.
# Test cases have the form "<word>-<year>"; words missing from that year's
# vocabulary are counted and skipped.
counter = 0  # reset here so re-running this cell does not accumulate the count
for test_case in test_cases_1:
    # Split on the last hyphen only, so a hyphenated word keeps its full form.
    word, year = test_case.rsplit("-", 1)
    ground_model = cade_models[f"model_{year}"]
    if word in ground_model.wv.vocab:
        test_case_dict_1[test_case] = ground_model.wv.get_vector(word)
    else:
        counter += 1
print(counter)

2


In [101]:
cade_model = cade_models[next(iter(cade_models))]

In [102]:
def get_similarities_of_model(model, test_word, top_n = 10):
    """
    Rank a model's vocabulary by cosine similarity to a given embedding.

    Parameters:
    model: Object exposing ``wv.vocab`` (iterable of words) and ``wv[word]`` (embedding lookup).
    test_word: Pair of (label, embedding); only the embedding is used for scoring.
    top_n (int): Number of best-scoring entries to return.

    Returns:
    list: Up to ``top_n`` (word, similarity) tuples, highest similarity first.
    """
    query_embedding = test_word[1]
    _label = test_word[0]  # label of the test case; read but not needed for scoring

    # Cosine similarity is 1 minus the cosine distance reported by SciPy.
    scores = {
        candidate: 1 - spatial.distance.cosine(query_embedding, model.wv[candidate])
        for candidate in model.wv.vocab
    }

    # Stable descending sort, then keep only the leading top_n entries.
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [103]:
def get_similarities_of_models(model_dict: dict, test_word_dict: dict):
    """
    Collect the most similar words for every test word in every model.

    Parameters:
    model_dict (dict): Maps keys of the form 'model_<year>' to model objects.
    test_word_dict (dict): Maps a test-case label to its embedding.

    Returns:
    dict: label -> {year -> result of get_similarities_of_model for that model}.
    """
    results = dict()
    for test_word in tqdm(test_word_dict.items()):
        label = test_word[0]
        # The part of the model key after the first underscore is the year.
        results[label] = {
            model_key.split("_")[1]: get_similarities_of_model(model, test_word)
            for model_key, model in model_dict.items()
        }
    return results

In [104]:
'''
# Takes long to execute
cade_similarities = get_similarities_of_models(cade_models, test_case_dict_1)
with open(path_to_test_data / 'cade_t1.json', 'w') as f:
    json.dump( cade_similarities, f, indent=4)''';

In [105]:
with open(path_to_test_data / 'cade_t1.json', 'r') as json_file:
    cade_similarities = json.load(json_file)

In [141]:
first_key = next(iter(cade_similarities))
first = cade_similarities[next(iter(cade_similarities))]

In [142]:
first_key

'bush-1990'

In [143]:
first

{'1990': [['bush', 1],
  ['bushs', 0.866192638874054],
  ['gorbachev', 0.8211690783500671],
  ['lithuania', 0.7609381079673767],
  ['conservatives', 0.7489315867424011],
  ['hussein', 0.7443727254867554],
  ['shamir', 0.734448254108429],
  ['gorbachevs', 0.7306549549102783],
  ['stalemate', 0.7097168564796448],
  ['husseins', 0.7055761814117432]],
 '1991': [['bush', 0.948120653629303],
  ['bushs', 0.8322234153747559],
  ['sununu', 0.8044367432594299],
  ['gorbachev', 0.7639114856719971],
  ['gorbachevs', 0.760785698890686],
  ['hosni', 0.7602129578590393],
  ['mubarak', 0.7572333812713623],
  ['baker', 0.75714510679245],
  ['kemp', 0.7527531385421753],
  ['yeltsin', 0.7031477689743042]],
 '1992': [['bush', 0.9445867538452148],
  ['bushs', 0.8651612401008606],
  ['clinton', 0.7914848327636719],
  ['yeltsin', 0.7668893933296204],
  ['clintons', 0.7362542748451233],
  ['reelected', 0.7336463928222656],
  ['perot', 0.7271304130554199],
  ['mubarak', 0.7240527272224426],
  ['aides', 0.71862

In [107]:
for key, value in cade_similarities.items():
    print(key)
    print(value)
    break

bush-1990
{'1990': [['bush', 1], ['bushs', 0.866192638874054], ['gorbachev', 0.8211690783500671], ['lithuania', 0.7609381079673767], ['conservatives', 0.7489315867424011], ['hussein', 0.7443727254867554], ['shamir', 0.734448254108429], ['gorbachevs', 0.7306549549102783], ['stalemate', 0.7097168564796448], ['husseins', 0.7055761814117432]], '1991': [['bush', 0.948120653629303], ['bushs', 0.8322234153747559], ['sununu', 0.8044367432594299], ['gorbachev', 0.7639114856719971], ['gorbachevs', 0.760785698890686], ['hosni', 0.7602129578590393], ['mubarak', 0.7572333812713623], ['baker', 0.75714510679245], ['kemp', 0.7527531385421753], ['yeltsin', 0.7031477689743042]], '1992': [['bush', 0.9445867538452148], ['bushs', 0.8651612401008606], ['clinton', 0.7914848327636719], ['yeltsin', 0.7668893933296204], ['clintons', 0.7362542748451233], ['reelected', 0.7336463928222656], ['perot', 0.7271304130554199], ['mubarak', 0.7240527272224426], ['aides', 0.7186276912689209], ['courtship', 0.71063482761383

In [108]:
first;

### TPPMI

In [109]:
ppmi_data_files = sorted(glob(str(path_to_tppmi_model  / "*.npz")))
words_files = sorted(glob(str(path_to_tppmi_model  / "*.pkl")))

Split context-words from timestamped-vocabularies

In [110]:
context_words_file = [path for path in words_files if "context-words" in path]
ppmi_vocab_files = [path for path in words_files if "context-words" not in path]

In [111]:
# Get ppmi-matrices and vocab
# Both file lists are sorted, so zipping pairs each vocabulary with the matrix
# at the same position. NOTE(review): this presumes exactly one .npz per
# vocabulary .pkl with matching sort order — confirm against the data directory.
ppmi_matrices = {}

for vocab_file, matrix_file in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(matrix_file)
    # NOTE: pickle.load can execute arbitrary code; only load files you created.
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    # Take the four characters after "ppmi-" in the file name itself, so the
    # key does not depend on how often "ppmi-" occurs in the directory path.
    key = Path(vocab_file).name.split("ppmi-")[1][0:4]
    ppmi_matrices[key] = {"ppmi_matrix" : ppmi_matrix, "vocab": vocab}

# Get common context-words
with open(context_words_file[0], "rb") as f:
    context_words = pickle.load(f)

In [112]:
ppmi_matrices.keys()

dict_keys(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'])

Create ppmi_model objects

In [113]:
ppmi_models = {key: PPMIModel.construct_from_data(ppmi_data["ppmi_matrix"], ppmi_data["vocab"], context_words) for key, ppmi_data in ppmi_matrices.items()}

In [114]:
tppmi_model = TPPMIModel(ppmi_models, dates="years")

# Evaluation Method

## MRR (Mean Reciprocal Rank)

In [115]:
def calculate_mrr_for_key(data_dict, key):
    """
    Calculate the Mean Reciprocal Rank (MRR) for a given key in the data dictionary.

    For every year stored under ``key``, the reciprocal rank (1 / position) of the
    key's own word in that year's ranked list is taken; a year whose list does not
    contain the word contributes 0. The result is the average over all years.

    Parameters:
    data_dict (dict): Maps 'word-year' keys to a dict of year -> ranked list of (word, score) pairs.
    key (str): The key in the dictionary to calculate MRR for. Assumes format 'word-year'.

    Returns:
    float: The MRR for the given key, or 0.0 if no years are stored under it.

    Raises:
    KeyError: If ``key`` is not present in ``data_dict``.
    """
    # Split on the last hyphen only, so hyphenated words ('e-mail-2000') stay intact.
    test_word = key.rsplit('-', 1)[0]
    pairs_by_year = data_dict[key]

    if not pairs_by_year:
        return 0.0  # No years/data to calculate MRR from

    total_reciprocal_rank = 0.0
    for word_score_pairs in pairs_by_year.values():
        for rank, (word, _score) in enumerate(word_score_pairs, start=1):
            if word == test_word:
                total_reciprocal_rank += 1.0 / rank
                break  # Only the first occurrence of the test word counts

    return total_reciprocal_rank / len(pairs_by_year)

In [116]:
first_key = "bush-1990"
print(first_key)
first

bush-1990


{'1990': [['bush', 1],
  ['bushs', 0.866192638874054],
  ['gorbachev', 0.8211690783500671],
  ['lithuania', 0.7609381079673767],
  ['conservatives', 0.7489315867424011],
  ['hussein', 0.7443727254867554],
  ['shamir', 0.734448254108429],
  ['gorbachevs', 0.7306549549102783],
  ['stalemate', 0.7097168564796448],
  ['husseins', 0.7055761814117432]],
 '1991': [['bush', 0.948120653629303],
  ['bushs', 0.8322234153747559],
  ['sununu', 0.8044367432594299],
  ['gorbachev', 0.7639114856719971],
  ['gorbachevs', 0.760785698890686],
  ['hosni', 0.7602129578590393],
  ['mubarak', 0.7572333812713623],
  ['baker', 0.75714510679245],
  ['kemp', 0.7527531385421753],
  ['yeltsin', 0.7031477689743042]],
 '1992': [['bush', 0.9445867538452148],
  ['bushs', 0.8651612401008606],
  ['clinton', 0.7914848327636719],
  ['yeltsin', 0.7668893933296204],
  ['clintons', 0.7362542748451233],
  ['reelected', 0.7336463928222656],
  ['perot', 0.7271304130554199],
  ['mubarak', 0.7240527272224426],
  ['aides', 0.71862

In [117]:
test_list = test_data_1[test_data_1.truth == "bush-1990"].copy()

In [118]:
test_list['year'] = test_list['equivalent'].apply(lambda x: int(x.split('-')[1]))  # Extract year and convert to int
test_list = test_list.sort_values(by='year')  # Sort by the new 'year' column
test_list = test_list.drop('year', axis=1)

In [119]:
test_list

Unnamed: 0,truth,equivalent
0,bush-1990,clinton-1992
1,bush-1990,clinton-1993
2,bush-1990,clinton-1994
3,bush-1990,clinton-1995
4,bush-1990,clinton-1996
5,bush-1990,clinton-1997
6,bush-1990,clinton-1998
7,bush-1990,clinton-1999
8,bush-1990,bush-2000
9,bush-1990,bush-2001


In [None]:
first_key = next(iter(cade_similarities))
first = cade_similarities[next(iter(cade_similarities))]

In [230]:
def calculate_rank_metric(similarities: dict, test_data: pd.DataFrame, metric = "MRR", k = 10) -> float:
    """
    Average the per-test-case score returned by calculate_mean_rank over all test cases.

    Parameters:
    similarities (dict): Maps each test key to the per-year ranked lists for that test case.
    test_data (pd.DataFrame): Ground-truth table passed through to calculate_mean_rank.
    metric (str): Metric selector passed through to calculate_mean_rank.
    k (int): Cut-off passed through to calculate_mean_rank.

    Returns:
    float: Mean of the per-test-case scores, or 0 when ``similarities`` is empty.
    """
    per_case_scores = [
        calculate_mean_rank(test_key, testcase, test_data, metric, k)
        for test_key, testcase in similarities.items()
    ]

    # Guard against dividing by zero when there are no test cases.
    if not per_case_scores:
        return 0
    return sum(per_case_scores) / len(per_case_scores)

In [231]:
def calculate_mean_rank(test_key: str, testcase: dict, test_data: pd.DataFrame, metric = "MRR", k = 10)-> float:
    """
    Average a rank metric over all years of a single test case.

    Parameters:
    test_key (str): Test case to evaluate; compared against the "truth" column.
    testcase (dict): Maps a year (str) to a ranked list whose items start with the candidate word.
    test_data (pd.DataFrame): Ground truth with string columns "truth" and "equivalent",
        where "equivalent" has the format 'word-year'.
    metric (str): "MRR" selects the reciprocal rank; any other value selects precision at k.
    k (int): Cut-off used for precision at k; not used for "MRR".

    Returns:
    float: Mean score over the years that have an expected equivalent, or 0.0 if there are none.
    """
    test_data_for_key = test_data[test_data["truth"] == test_key]
    ranks = []

    for year, ranked_items in testcase.items():
        # Match the complete "-<year>" suffix rather than any string ending in the digits.
        year_mask = test_data_for_key["equivalent"].str.endswith(f"-{year}")
        test_data_for_year = test_data_for_key[year_mask]

        if len(test_data_for_year) == 0:
            continue  # Skip if no data for year, as there's nothing to rank

        # NOTE(review): if several equivalents exist for the same year, only the
        # first row is scored — confirm this is the intended evaluation.
        # Split on the last hyphen so hyphenated words keep their full form.
        target_word = test_data_for_year["equivalent"].iloc[0].rsplit("-", 1)[0]

        # Built only for years that are actually scored.
        word_list = [item[0] for item in ranked_items]

        if metric == "MRR":
            rank = calculate_reciprocal_rank(word_list, target_word)
        else:
            rank = calculate_precision_at_k(word_list, target_word, k)

        ranks.append(rank)

    if not ranks:  # Ensure division by 0 does not occur
        return 0.0
    return sum(ranks) / len(ranks)

In [232]:
def calculate_reciprocal_rank(test_list: list, test_word: str) -> float:
    """
    Return 1 / rank of the first occurrence of a word in a ranked list.

    Parameters:
    test_list (list of str): Ranked candidates, best first.
    test_word (str): The correct answer to look for.

    Returns:
    float: 1.0 divided by the 1-based position of test_word, or 0.0 if it is absent.
    """
    # Guard clause: an absent word has no rank.
    if test_word not in test_list:
        return 0.0

    position = test_list.index(test_word)  # 0-based position of the first match
    return 1.0 / (position + 1)

In [233]:
def calculate_precision_at_k(test_list: list, test_word: str, k: int) -> int:
    """
    Report whether a word appears among the first K entries of a ranked list.

    Parameters:
    test_list (list of str): Ranked candidates, assumed to be ordered by relevance.
    test_word (str): The correct answer to look for.
    k (int): How many leading entries to inspect; must be positive.

    Returns:
    int: 1 if test_word is among the first k entries of test_list, otherwise 0.

    Raises:
    ValueError: If k is zero or negative.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Membership in the leading slice decides the score.
    hit = test_word in test_list[:k]
    return 1 if hit else 0

In [240]:
calculate_rank_metric(cade_similarities, test_data_1, metric="MP", k=1)

0.24758672564744716

In [238]:
for key, value in first.items():
    print("Key")
    print(type(key))
    print(key)
    word_list = [item[0] for item in value]
    print("value")
    print(type(word_list))
    print(word_list)
    print(calculate_precision_at_k(word_list, "gorbachev", k = 2))
    break

Key
<class 'str'>
1990
value
<class 'list'>
['bush', 'bushs', 'gorbachev', 'lithuania', 'conservatives', 'hussein', 'shamir', 'gorbachevs', 'stalemate', 'husseins']
0


# Experiment

To examine the quality of embedding alignment, we create a task to query equivalences across years.

For example, given obama-2012, we want to query its equivalent word in 2002. Obama was the U.S. president in 2012, so the equivalent in 2002 is bush, who was the U.S. president at that time. In this way, we create two testsets.

In [None]:
test_data_1.head()

# Scrapyard

In [201]:
def calculate_mean_reciprocal_rank_test(test_key: str, testcase: dict, test_data: pd.DataFrame):
    """
    Debug variant of the per-test-case MRR computation that prints every intermediate value.

    Parameters:
    test_key (str): Test case to evaluate; compared against the "truth" column.
    testcase (dict): Maps a year (str) to a ranked list whose items start with the candidate word.
    test_data (pd.DataFrame): Ground truth with columns "truth" and "equivalent".

    Returns:
    float: Mean reciprocal rank over the years that have ground-truth rows, or 0 if there are none.
    """
    print(test_key)
    rows_for_key = test_data[test_data["truth"] == test_key]
    print("test_data_for_key")
    print(f"length: {len(rows_for_key)}")
    print(rows_for_key)

    reciprocal_ranks = []
    for year, ranked_items in testcase.items():
        print(f"Key: {year}")
        rows_for_year = rows_for_key[rows_for_key["equivalent"].str.endswith(year)]
        candidates = [entry[0] for entry in ranked_items]
        print("WORD-LIST")
        print(candidates)

        if len(rows_for_year) == 0:
            # No ground-truth row ends with this year, so there is nothing to score.
            print("CONTINUE")
        else:
            print("test_data_for_year")
            print(rows_for_year)
            expected_word = rows_for_year["equivalent"].iloc[0].split("-")[0]
            print(expected_word)
            score = calculate_reciprocal_rank(candidates, expected_word)
            print(score)
            reciprocal_ranks.append(score)

    # Avoid dividing by zero when no year could be scored.
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0