In [1]:
%load_ext autoreload

In [2]:
import json
import pickle
import pandas as pd
from glob import glob
from pathlib import Path
import scipy.sparse as sp
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

import src.test.util as test_util

# Imports

In [3]:
# Relative locations of the inputs used below (resolved against the kernel's working directory).
path_to_test_data = Path("../../data", "test")
path_to_tppmi_model = Path("../../data", "ppmi-matrices", "nyt-data")
path_to_twec_model = Path("../../model", "nyt-data", "cade", "model")
path_to_static_model = Path("../../model", "nyt-data", "static")

# Setup

## Testsets

### Testset 1

Testset 1 is based on publicly recorded knowledge that, for each year, lists different names for a particular role, such as U.S. president, U.K. prime minister, NFL Super Bowl champion team, and so on.

In [4]:
# NOTE(review): read_csv consumes the first line of the file as a header row, and the
# column names are overwritten in the following cell. If testset_1.csv has no header
# line, the first test pair is silently lost — confirm against the file (11027 rows are
# reported below; Yao et al. (2018) presumably describe 11,028 pairs — TODO verify).
test_data_1_all = pd.read_csv(path_to_test_data / "testset_1.csv")

In [5]:
test_data_1_all.columns = ['truth', 'equivalent']

In [6]:
test_data_1_all = test_data_1_all.sort_values(by='truth', ascending=True)

In [7]:
test_cases_1_all = test_data_1_all['truth'].unique()

In [8]:
# Size of the complete testset: total rows vs. distinct 'truth' entries.
print(
    "Testset",
    f"Testcases (all): {len(test_data_1_all)}",
    f"Testcases (unique): {len(test_cases_1_all)}",
    sep="\n",
)

Testset
Testcases (all): 11027
Testcases (unique): 499


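Now we split the testset into static and dynamic test cases, as was done by Di Carlo et al. in their paper "Training Temporal Word Embeddings with a Compass": a test case is static if the query and its equivalent are the same word, and dynamic if they are different words.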

In [9]:
# Split every "<word>-<year>" entry into its word and year parts.
# Splitting once from the right keeps words that themselves contain a hyphen intact;
# a plain split('-') would compare only the first fragment of such words.
split_truth = test_data_1_all['truth'].str.rsplit('-', n=1, expand=True)
split_equivalent = test_data_1_all['equivalent'].str.rsplit('-', n=1, expand=True)

# "static": query and equivalent are the same word; "dynamic": everything else.
# The dynamic mask is the exact complement, so the two subsets partition the testset.
static_mask = split_truth[0] == split_equivalent[0]
dynamic_mask = ~static_mask

# Apply the masks to create the separate DataFrames
test_data_1_static = test_data_1_all[static_mask]
test_data_1_dynamic = test_data_1_all[dynamic_mask]

In [10]:
test_cases_1_static = test_data_1_static['truth'].unique()
test_cases_1_dynamic = test_data_1_dynamic['truth'].unique()

In [11]:
# Size of the static subset: total rows vs. distinct 'truth' entries.
print(
    "Static",
    f"Testcases (all): {len(test_data_1_static)}",
    f"Testcases (unique): {len(test_cases_1_static)}",
    sep="\n",
)

Static
Testcases (all): 2937
Testcases (unique): 443


In [12]:
# Size of the dynamic subset: total rows vs. distinct 'truth' entries.
print(
    "Dynamic",
    f"Testcases (all): {len(test_data_1_dynamic)}",
    f"Testcases (unique): {len(test_cases_1_dynamic)}",
    sep="\n",
)

Dynamic
Testcases (all): 8090
Testcases (unique): 499


## Models

### TWEC

In [13]:
# Collect the per-year TWEC (CADE) model files. glob returns them in arbitrary order;
# the loaded models are re-ordered by year in a later cell.
model_filenames_cade = glob(str(path_to_twec_model / "*_data.model"))

In [14]:
model_filenames_cade

['../../model/nyt-data/cade/model/1994_data.model',
 '../../model/nyt-data/cade/model/2000_data.model',
 '../../model/nyt-data/cade/model/1999_data.model',
 '../../model/nyt-data/cade/model/2011_data.model',
 '../../model/nyt-data/cade/model/2008_data.model',
 '../../model/nyt-data/cade/model/2014_data.model',
 '../../model/nyt-data/cade/model/2005_data.model',
 '../../model/nyt-data/cade/model/1991_data.model',
 '../../model/nyt-data/cade/model/1996_data.model',
 '../../model/nyt-data/cade/model/2002_data.model',
 '../../model/nyt-data/cade/model/2013_data.model',
 '../../model/nyt-data/cade/model/2016_data.model',
 '../../model/nyt-data/cade/model/2007_data.model',
 '../../model/nyt-data/cade/model/1993_data.model',
 '../../model/nyt-data/cade/model/2009_data.model',
 '../../model/nyt-data/cade/model/2015_data.model',
 '../../model/nyt-data/cade/model/1990_data.model',
 '../../model/nyt-data/cade/model/2004_data.model',
 '../../model/nyt-data/cade/model/2001_data.model',
 '../../mode

In [15]:
# Load one Word2Vec model per file, keyed as "model_<year>"; the year is the four
# characters that precede "_data" in the file path.
models_cade = {}
for model_file in tqdm(model_filenames_cade):
    year = model_file.split('_data')[0][-4:]
    models_cade[f"model_{year}"] = Word2Vec.load(model_file)

  0%|          | 0/27 [00:00<?, ?it/s]

In [16]:
# Rebuild the dictionary in ascending year order (the numeric part after "model_").
models_cade = dict(
    sorted(models_cade.items(), key=lambda item: int(item[0].split('_')[1]))
)

In [17]:
models_cade.keys()

dict_keys(['model_1990', 'model_1991', 'model_1992', 'model_1993', 'model_1994', 'model_1995', 'model_1996', 'model_1997', 'model_1998', 'model_1999', 'model_2000', 'model_2001', 'model_2002', 'model_2003', 'model_2004', 'model_2005', 'model_2006', 'model_2007', 'model_2008', 'model_2009', 'model_2010', 'model_2011', 'model_2012', 'model_2013', 'model_2014', 'model_2015', 'model_2016'])

Create dictionary of testsets that contain all test-words along with their embedding in the respective year

In [18]:
# Test-case dictionary for the full testset; the helper prints how many test cases are
# missing from the model vocabularies. The next two cells repeat this for the static
# and dynamic subsets.
test_case_dict_cade_all = test_util.create_test_case_dict_cade(test_cases_1_all, models_cade)

2 Testcases are not in the vocab of the model(s)


In [19]:
test_case_dict_cade_static = test_util.create_test_case_dict_cade(test_cases_1_static, models_cade)

2 Testcases are not in the vocab of the model(s)


In [20]:
test_case_dict_cade_dynamic = test_util.create_test_case_dict_cade(test_cases_1_dynamic, models_cade)

2 Testcases are not in the vocab of the model(s)


Retrieve most similar words for each testword in each year

In [21]:
# Similarity lookup for the full testset; the next two cells repeat this for the static
# and dynamic subsets.
similarities_cade_all = test_util.get_similarities_of_models(models_cade, test_case_dict_cade_all)

  0%|          | 0/497 [00:00<?, ?it/s]

In [22]:
similarities_cade_static = test_util.get_similarities_of_models(models_cade, test_case_dict_cade_static)

  0%|          | 0/441 [00:00<?, ?it/s]

In [23]:
similarities_cade_dynamic = test_util.get_similarities_of_models(models_cade, test_case_dict_cade_dynamic)

  0%|          | 0/497 [00:00<?, ?it/s]

### Static Word2Vec

In [24]:
# Load the single Word2Vec model stored in the static model directory; it is reported
# as the baseline in the results section.
model_static = Word2Vec.load(str(path_to_static_model / "w2v_model.model"))

In [25]:
# Test-case dictionary of the static model for the full testset (static and dynamic
# subsets follow). Note the argument order: model first, test cases second — the
# reverse of create_test_case_dict_cade.
test_case_dict_static_all = test_util.create_test_case_dict_static(model_static, test_cases_1_all)

0 Testcases are not in the vocab of the model


In [26]:
test_case_dict_static_static = test_util.create_test_case_dict_static(model_static, test_cases_1_static)

0 Testcases are not in the vocab of the model


In [27]:
test_case_dict_static_dynamic = test_util.create_test_case_dict_static(model_static, test_cases_1_dynamic)

0 Testcases are not in the vocab of the model


In [28]:
# Similarity lookup with the static model for the full testset (static and dynamic
# subsets follow) — presumably the counterpart of the TWEC similarity step; verify in test_util.
similarities_static_all = test_util.get_similarities_of_models_static(model_static, test_case_dict_static_all)

  0%|          | 0/499 [00:00<?, ?it/s]

In [29]:
similarities_static_static = test_util.get_similarities_of_models_static(model_static, test_case_dict_static_static)

  0%|          | 0/443 [00:00<?, ?it/s]

In [30]:
similarities_static_dynamic = test_util.get_similarities_of_models_static(model_static, test_case_dict_static_dynamic)

  0%|          | 0/499 [00:00<?, ?it/s]

### TPPMI

In [85]:
%autoreload 2

# NOTE(review): these imports sit mid-notebook and src.test.util is already imported in
# the top import cell; the jump in execution counts at this cell suggests it was re-run
# while the modules were being edited. Consider moving the two TPPMI imports to the top
# import cell so a fresh Restart & Run All shows every dependency up front.
from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel
import src.test.util as test_util

In [86]:
# Sorted listings of the PPMI matrix files (.npz) and the pickled word lists (.pkl).
# A later cell pairs matrices and vocabularies by position, so a stable order matters.
ppmi_data_files = sorted(glob(str(path_to_tppmi_model  / "*.npz")))
words_files = sorted(glob(str(path_to_tppmi_model  / "*.pkl")))

Split context-words from timestamped-vocabularies

In [87]:
# Partition the pickles by file name: paths containing "context-words" hold the shared
# context words, all remaining ones are treated as per-year vocabularies.
context_words_file = list(filter(lambda path: "context-words" in path, words_files))
ppmi_vocab_files = list(filter(lambda path: "context-words" not in path, words_files))

In [88]:
# Load every PPMI matrix together with its vocabulary, keyed by year.
# zip() silently truncates to the shorter list, so a missing or extra file would drop or
# mis-pair years without any error; fail loudly instead.
if len(ppmi_vocab_files) != len(ppmi_data_files):
    raise ValueError(
        f"Found {len(ppmi_vocab_files)} vocabulary files but {len(ppmi_data_files)} PPMI matrices"
    )
if not context_words_file:
    raise FileNotFoundError(f"No context-words pickle found in {path_to_tppmi_model}")

ppmi_matrices = {}

for filenames in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(filenames[1])
    # NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
    with open(filenames[0], "rb") as f:
        vocab = pickle.load(f)
    # Take the year from the file name only (the four characters after "ppmi-"), so the
    # result no longer depends on the parent directory also containing "ppmi-".
    key = Path(filenames[0]).name.split("ppmi-")[1][0:4]
    ppmi_matrices[key] = {"ppmi_matrix" : ppmi_matrix, "vocab": vocab}

# Get common context-words
with open(context_words_file[0], "rb") as f:
    context_words = pickle.load(f)

In [89]:
ppmi_matrices.keys()

dict_keys(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'])

Create ppmi_model objects

In [90]:
# Build one PPMIModel per year from its matrix and vocabulary; every model receives the
# same list of context words.
ppmi_models = {}
for year, year_data in ppmi_matrices.items():
    ppmi_models[year] = PPMIModel.construct_from_data(
        year_data["ppmi_matrix"], year_data["vocab"], context_words, normalize=True
    )

In [91]:
tppmi_model = TPPMIModel(ppmi_models, dates="years", smooth=False)

In [93]:
# Test-case dictionary of the TPPMI model for the full testset; the helper prints how
# many test cases are missing from the model vocabulary. The next two cells repeat this
# for the static and dynamic subsets.
test_case_dict_tppmi_all = test_util.create_test_case_dict_tppmi(tppmi_model, test_cases_1_all)

19 Testcases are not in the vocab of the model


In [94]:
test_case_dict_tppmi_static = test_util.create_test_case_dict_tppmi(tppmi_model, test_cases_1_static)

15 Testcases are not in the vocab of the model


In [95]:
test_case_dict_tppmi_dynamic = test_util.create_test_case_dict_tppmi(tppmi_model, test_cases_1_dynamic)

19 Testcases are not in the vocab of the model


In [96]:
# Similarity lookup with the TPPMI model for the full testset (static and dynamic subsets
# follow) — presumably the counterpart of the TWEC similarity step; verify in test_util.
similarities_tppmi_all = test_util.get_similarites_of_models_tppmi(tppmi_model, test_case_dict_tppmi_all)

  0%|          | 0/480 [00:00<?, ?it/s]

In [97]:
similarities_tppmi_static = test_util.get_similarites_of_models_tppmi(tppmi_model, test_case_dict_tppmi_static)

  0%|          | 0/428 [00:00<?, ?it/s]

In [98]:
similarities_tppmi_dynamic = test_util.get_similarites_of_models_tppmi(tppmi_model, test_case_dict_tppmi_dynamic)

  0%|          | 0/480 [00:00<?, ?it/s]

# Experiment

To examine the quality of embedding alignment, we create a task to query equivalences across years.

For example, given obama-2012, we want to query its equivalent word in 2002. As we know obama is the U.S. president in 2012; its equivalent in 2002 is bush, who was the U.S. president at that time. In this way, we create two testsets.

All results are rounded to three decimal places.

In [100]:
# Rank cutoffs K used for MP@K, and the three testset variants that are evaluated.
cutoffs = [1, 3, 5, 10]
list_of_types = ["static", "dynamic", "all"]

# One [similarities, test data] pair per testset variant, in the order of list_of_types.
list_of_data_cade = [
    [similarities_cade_static, test_data_1_static],
    [similarities_cade_dynamic, test_data_1_dynamic],
    [similarities_cade_all, test_data_1_all],
]
list_of_data_tppmi = [
    [similarities_tppmi_static, test_data_1_static],
    [similarities_tppmi_dynamic, test_data_1_dynamic],
    [similarities_tppmi_all, test_data_1_all],
]
list_of_data_static = [
    [similarities_static_static, test_data_1_static],
    [similarities_static_dynamic, test_data_1_dynamic],
    [similarities_static_all, test_data_1_all],
]

# Variant name -> [similarities, test data], one mapping per model family.
config_dict_cade = dict(zip(list_of_types, list_of_data_cade))
config_dict_tppmi = dict(zip(list_of_types, list_of_data_tppmi))
config_dict_static = dict(zip(list_of_types, list_of_data_static))

# Variant name -> {metric name: score}; filled in by the metric cells below.
scores_cade = {key: {} for key in list_of_types}
scores_static = {key: {} for key in list_of_types}
scores_tppmi = {key: {} for key in list_of_types}

## Mean Reciprocal Rank (@10)

The Mean Reciprocal Rank (MRR) is a statistical measure used to evaluate the performance of a system that returns a ranked list of responses to queries. It is the average of the reciprocal ranks of the first correct answer for each query, where the reciprocal rank is the inverse of the rank at which the first relevant answer is found.
It is evaluated @10

### TWEC

In [101]:
# MRR of the TWEC models for each testset variant, rounded to three decimals.
for key, value in tqdm(config_dict_cade.items()):
    similarities, test_data = value
    mrr = test_util.calculate_rank_metric(similarities, test_data, metric='MRR')
    scores_cade[key]["mrr@10"] = round(mrr, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### Static Word2Vec

In [102]:
# MRR of the static Word2Vec model for each testset variant, rounded to three decimals.
for key, value in tqdm(config_dict_static.items()):
    similarities, test_data = value
    mrr = test_util.calculate_rank_metric_static(similarities, test_data, metric='MRR')
    scores_static[key]["mrr@10"] = round(mrr, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### TPPMI

In [106]:
# MRR of the TPPMI model for each testset variant, rounded to three decimals.
for key, value in tqdm(config_dict_tppmi.items()):
    similarities, test_data = value
    mrr = test_util.calculate_rank_metric(similarities, test_data, metric='MRR')
    scores_tppmi[key]["mrr@10"] = round(mrr, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

## Mean Precision (@K)

As introduced by Yao et al. (2018), MP@K is defined as follows: consider the K words most similar to the query embedding for the given year. The Precision@K for a particular test case i, denoted P@K[i], equals 1 if the target word appears within this set of K words and 0 otherwise. MP@K is the mean of P@K[i] over all test cases.

### TWEC

In [107]:
# MP@K of the TWEC models for every cutoff and testset variant, rounded to three decimals.
for key, value in tqdm(config_dict_cade.items()):
    similarities, test_data = value
    for k in cutoffs:
        precision = test_util.calculate_rank_metric(similarities, test_data, metric='MP', k=k)
        score_key = f"mp@{k}"
        scores_cade[key][score_key] = round(precision, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### Static Word2Vec

In [108]:
# MP@K of the static Word2Vec model for every cutoff and testset variant, rounded to three decimals.
for key, value in tqdm(config_dict_static.items()):
    similarities, test_data = value
    for k in cutoffs:
        precision = test_util.calculate_rank_metric_static(similarities, test_data, metric='MP', k=k)
        score_key = f"mp@{k}"
        scores_static[key][score_key] = round(precision, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### TPPMI

In [109]:
# MP@K of the TPPMI model for every cutoff and testset variant, rounded to three decimals.
for key, value in tqdm(config_dict_tppmi.items()):
    similarities, test_data = value
    for k in cutoffs:
        precision = test_util.calculate_rank_metric(similarities, test_data, metric='MP', k=k)
        score_key = f"mp@{k}"
        scores_tppmi[key][score_key] = round(precision, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

## Results

In [113]:
# One table per model family: rows are the testset variants, columns the metrics.
score_table_cade = pd.DataFrame(scores_cade).transpose()
score_table_static = pd.DataFrame(scores_static).transpose()
score_table_tppmi_500 = pd.DataFrame(scores_tppmi).transpose()

# Show each table under its heading.
for heading, table in (
    ("Scores of TWEC", score_table_cade),
    ("Scores of TPPMI", score_table_tppmi_500),
    ("Scores of Static Word2Vec (Baseline)", score_table_static),
):
    print(heading)
    display(table)

Scores of TWEC


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
static,0.587,0.501,0.656,0.708,0.746
dynamic,0.346,0.268,0.398,0.451,0.512
all,0.394,0.316,0.448,0.501,0.556


Scores of TPPMI


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
static,0.253,0.192,0.292,0.33,0.395
dynamic,0.111,0.079,0.125,0.154,0.196
all,0.142,0.104,0.162,0.192,0.238


Scores of Static Word2Vec (Baseline)


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
static,1.0,1.0,1.0,1.0,1.0
dynamic,0.147,0.0,0.24,0.372,0.463
all,0.374,0.266,0.442,0.539,0.606


----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------

In [114]:
# Write the three score tables as CSV files (row labels included) to the results
# directory, creating the directory first if it does not exist.
output_dir = Path("../../data/results/nyt-data")
output_dir.mkdir(parents=True, exist_ok=True)

tables_by_filename = {
    "score_table_cade.csv": score_table_cade,            # TWEC (CADE)
    'score_table_tppmi_500.csv': score_table_tppmi_500,  # TPPMI
    'score_table_static.csv': score_table_static,        # static Word2Vec baseline
}
for filename, table in tables_by_filename.items():
    table.to_csv(output_dir / filename, index=True)
