In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import json
import copy
import pickle
import pandas as pd
from glob import glob
from pathlib import Path
import scipy.sparse as sp
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

import src.test.util as test_util

# Imports

In [3]:
# Shared roots (relative to this notebook) for the evaluation data and the trained models.
_data_root = Path("../../data")
_nyt_model_root = Path("../../model") / "nyt-data"

path_to_test_data = _data_root / "test"
path_to_tppmi_model = _data_root / "ppmi-matrices" / "nyt-data"
path_to_twec_model = _nyt_model_root / "cade" / "model"
path_to_static_model = _nyt_model_root / "static"

# Setup

## Testsets

### Testset 1

Based on publicly recorded knowledge: for each year, the testset lists the name that held a particular role, such as U.S. president, U.K. prime minister, NFL Super Bowl champion team, and so on.

In [22]:
# Read the enriched version of test set 1 from the test-data folder.
test_data = pd.read_csv(path_to_test_data / "testset_1_enriched.csv")

In [23]:
# Replace the CSV header with the column names used throughout this notebook
# (this assignment requires the file to have exactly four columns).
test_data.columns = ['truth', 'equivalent', 'token', 'tag']

In [24]:
# Order the rows alphabetically by the query ("truth") entry.
test_data = test_data.sort_values('truth')

In [29]:
# Load the Pantheon person list and name its two columns: the token and the domain it belongs to.
path_to_entity_list = Path("../../data/processed/entities/persons_pantheon_data.csv")
entity_list = pd.read_csv(path_to_entity_list)
entity_list.columns = ["token", "domain"]

In [30]:
# Preview the first rows of the test set.
test_data.head()

Unnamed: 0,truth,equivalent,token,tag
0,49ers-1990,patriots-2015,49ers,unknown
22,49ers-1990,patriots-2005,49ers,unknown
21,49ers-1990,giants-1991,49ers,unknown
20,49ers-1990,redskins-1992,49ers,unknown
19,49ers-1990,broncos-2016,49ers,unknown


In [31]:
# Show the entity list (token -> domain).
entity_list

Unnamed: 0,token,domain
0,"""noynoy""",INSTITUTIONS
1,'ulukalala,INSTITUTIONS
2,13th,INSTITUTIONS
3,14th,INSTITUTIONS
4,`abbas,INSTITUTIONS
...,...,...
12118,ọbasanjọ,INSTITUTIONS
12119,‘abdu’l-hamid,INSTITUTIONS
12120,‘abdu’l-muttalib,INSTITUTIONS
12121,シルショ,ARTS


In [32]:
# Keep only test rows whose token occurs in the entity list and attach that token's domain.
merged_test_data = entity_list.merge(test_data, how='inner', on='token')

In [33]:
# Inspect the merged frame.
# NOTE(review): the output contains repeated (truth, equivalent) pairs; confirm the duplicates are intended.
merged_test_data

Unnamed: 0,token,domain,truth,equivalent,tag
0,agassi,SPORTS,agassi-1999,sampras-1998,I-per
1,agassi,SPORTS,agassi-1999,hewitt-2001,B-per
2,agassi,SPORTS,agassi-1999,hewitt-2001,B-per
3,agassi,SPORTS,agassi-1999,hewitt-2001,I-per
4,agassi,SPORTS,agassi-1999,hewitt-2001,I-per
...,...,...,...,...,...
553244,yeltsin,INSTITUTIONS,yeltsin-1999,putin-2014,unknown
553245,yeltsin,INSTITUTIONS,yeltsin-1999,putin-2015,unknown
553246,yeltsin,INSTITUTIONS,yeltsin-1999,medvedev-2008,unknown
553247,yeltsin,INSTITUTIONS,yeltsin-1999,medvedev-2009,unknown


In [10]:
# Only the query, its equivalent and the entity domain are needed from here on.
test_data_person = merged_test_data.loc[:, ['truth', 'equivalent', 'domain']]

In [11]:
# Restrict the test set to rows labelled "PERSON".
# NOTE(review): `test_data_person` was built with the columns truth/equivalent/domain, so the
# `.entity` attribute does not exist and this line raises AttributeError on a fresh
# "Restart & Run All". The displayed `domain` values are INSTITUTIONS/SPORTS/ARTS, not "PERSON";
# confirm which column and value this filter is supposed to use.
test_data_person = test_data_person[test_data_person.entity == "PERSON"]

In [12]:
# From here on `test_data` refers to the entity-filtered subset rather than the raw CSV contents.
test_data = test_data_person

In [13]:
# Distinct query entries, in order of first appearance.
test_cases = pd.unique(test_data['truth'])

In [14]:
# Show the distinct query entries.
test_cases

array(['albright-1997', 'albright-1998', 'albright-1999', 'albright-2000',
       'baker-1990', 'baker-1991', 'barak-1999', 'barak-2000',
       'bates-1991', 'benedict-2005', 'benedict-2006', 'benedict-2007',
       'benedict-2008', 'benedict-2009', 'benedict-2010', 'benedict-2011',
       'benedict-2012', 'bernanke-2006', 'bernanke-2007', 'bernanke-2008',
       'bernanke-2009', 'bernanke-2010', 'bernanke-2011', 'bernanke-2012',
       'bernanke-2013', 'berry-2002', 'biden-2008', 'biden-2009',
       'biden-2010', 'biden-2011', 'biden-2012', 'biden-2013',
       'biden-2014', 'biden-2015', 'biden-2016', 'blair-1997',
       'blair-1998', 'blair-1999', 'blair-2000', 'blair-2001',
       'blair-2002', 'blair-2003', 'blair-2004', 'blair-2005',
       'blair-2006', 'blasio-2014', 'blasio-2015', 'blasio-2016',
       'bloomberg-2002', 'bloomberg-2003', 'bloomberg-2004',
       'bloomberg-2005', 'bloomberg-2006', 'bloomberg-2007',
       'bloomberg-2008', 'bloomberg-2009', 'bloomberg-2010'

In [15]:
# Size of the complete test set: total rows and distinct queries.
print(
    "Testset",
    f"Testcases (all): {len(test_data)}",
    f"Testcases (unique): {len(test_cases)}",
    sep="\n",
)

Testset
Testcases (all): 488876
Testcases (unique): 403


Now we want to split the testset into static & dynamic testcases as was done by Di Carlo et al. in their paper "Training Temporal Word Embeddings with a Compass"

In [16]:
# Separate the token from the year. Splitting once from the right cuts off only the
# trailing "-<year>", so tokens that themselves contain a hyphen stay intact.
split_truth = test_data['truth'].str.rsplit('-', n=1, expand=True)
split_equivalent = test_data['equivalent'].str.rsplit('-', n=1, expand=True)

# "static": query and equivalent share the same token (only the year differs);
# "dynamic": the equivalent is a different token.
static_mask = split_truth[0] == split_equivalent[0]
dynamic_mask = split_truth[0] != split_equivalent[0]

# Applying the masks to create the separate DataFrames
test_data_static = test_data[static_mask]
test_data_dynamic = test_data[dynamic_mask]

In [17]:
# Distinct query entries of the static and the dynamic subset.
test_cases_static, test_cases_dynamic = (
    subset['truth'].unique() for subset in (test_data_static, test_data_dynamic)
)

In [18]:
# Size of the static subset: total rows and distinct queries.
print(
    "Static",
    f"Testcases (all): {len(test_data_static)}",
    f"Testcases (unique): {len(test_cases_static)}",
    sep="\n",
)

Static
Testcases (all): 162978
Testcases (unique): 367


In [19]:
# Size of the dynamic subset: total rows and distinct queries.
print(
    "Dynamic",
    f"Testcases (all): {len(test_data_dynamic)}",
    f"Testcases (unique): {len(test_cases_dynamic)}",
    sep="\n",
)

Dynamic
Testcases (all): 325898
Testcases (unique): 403


## Models

### TWEC

In [20]:
# Collect the TWEC/CADE model files (file names ending in "_data.model").
model_filenames_cade = glob(str(path_to_twec_model / "*_data.model"))

In [21]:
# Load one Word2Vec model per file and store it under the key "model_<year>".
models_cade = {}
for model_file in tqdm(model_filenames_cade):
    # The year is taken as the four characters preceding the first "_data" in the path.
    year = model_file.split('_data')[0][-4:]
    models_cade[f"model_{year}"] = Word2Vec.load(model_file)

  0%|          | 0/27 [00:00<?, ?it/s]

In [22]:
# Re-create the dict in chronological order (glob gives no ordering guarantee).
models_cade = dict(
    sorted(models_cade.items(), key=lambda entry: int(entry[0].split('_')[1]))
)

In [23]:
# Check that every year is present and in order.
models_cade.keys()

dict_keys(['model_1990', 'model_1991', 'model_1992', 'model_1993', 'model_1994', 'model_1995', 'model_1996', 'model_1997', 'model_1998', 'model_1999', 'model_2000', 'model_2001', 'model_2002', 'model_2003', 'model_2004', 'model_2005', 'model_2006', 'model_2007', 'model_2008', 'model_2009', 'model_2010', 'model_2011', 'model_2012', 'model_2013', 'model_2014', 'model_2015', 'model_2016'])

Create dictionary of testsets that contain all test-words along with their embedding in the respective year

In [24]:
# Build the test-case dictionary for all queries against the yearly TWEC models.
# The helper reports how many test cases are missing from the model vocabulary (see output).
test_case_dict_cade_all = test_util.create_test_case_dict_cade(test_cases, models_cade)

2 Testcases are not in the vocab of the model(s)


In [25]:
# Same as for all queries, restricted to the static subset.
test_case_dict_cade_static = test_util.create_test_case_dict_cade(test_cases_static, models_cade)

2 Testcases are not in the vocab of the model(s)


In [26]:
# Same as for all queries, restricted to the dynamic subset.
test_case_dict_cade_dynamic = test_util.create_test_case_dict_cade(test_cases_dynamic, models_cade)

2 Testcases are not in the vocab of the model(s)


Retrieve most similar words for each testword in each year

In [27]:
# Retrieve the top-10 similar words for every query (all test cases) from the TWEC models.
# NOTE(review): label="PERSON" is passed, but the `domain` column of `entity_list` shows values
# such as INSTITUTIONS/SPORTS/ARTS; confirm how the helper uses `label`.
similarities_cade_all = test_util.get_similarities_of_models(models_cade, test_case_dict_cade_all, entity_list=entity_list, label="PERSON", top_n=10)

  0%|          | 0/401 [00:00<?, ?it/s]

barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 30
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000

In [28]:
# Retrieve the top-10 similar words for every static query from the TWEC models.
similarities_cade_static = test_util.get_similarities_of_models(models_cade, test_case_dict_cade_static, entity_list=entity_list, label="PERSON", top_n=10)

  0%|          | 0/365 [00:00<?, ?it/s]

barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 30
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000

In [29]:
# Retrieve the top-10 similar words for every dynamic query from the TWEC models.
similarities_cade_dynamic = test_util.get_similarities_of_models(models_cade, test_case_dict_cade_dynamic, entity_list=entity_list, label="PERSON", top_n=10)

  0%|          | 0/401 [00:00<?, ?it/s]

barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-1999
incremented by: 10
barak-1999
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 30
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000
incremented by: 10
barak-2000
incremented by: 20
barak-2000

### Static Word2Vec

In [30]:
# Load the single (non-temporal) Word2Vec model that serves as the baseline.
model_static = Word2Vec.load(str(path_to_static_model / "w2v_model.model"))

In [31]:
# Build the test-case dictionary for all queries against the static model.
test_case_dict_static_all = test_util.create_test_case_dict_static(model_static, test_cases)

0 Testcases are not in the vocab of the model


In [32]:
# Test-case dictionary for the static subset against the static model.
test_case_dict_static_static = test_util.create_test_case_dict_static(model_static, test_cases_static)

0 Testcases are not in the vocab of the model


In [33]:
# Test-case dictionary for the dynamic subset against the static model.
test_case_dict_static_dynamic = test_util.create_test_case_dict_static(model_static, test_cases_dynamic)

0 Testcases are not in the vocab of the model


In [34]:
# Top-10 similar words for every query (all test cases) from the static baseline, with entity filtering enabled.
similarities_static_all = test_util.get_similarities_of_models_static(model_static, test_case_dict_static_all, entity_list=entity_list, filter=True, top_n=10)

  0%|          | 0/403 [00:00<?, ?it/s]

bridges-2010
incremented by: 10
bridges-2010
incremented by: 20
bridges-2010
incremented by: 30
bridges-2010
incremented by: 40
bridges-2010
incremented by: 50
ebola-2015
incremented by: 10
ebola-2015
incremented by: 20
ebola-2015
incremented by: 30
ebola-2015
incremented by: 40
ebola-2015
incremented by: 50
foster-1992
incremented by: 10
foster-1992
incremented by: 20
heat-2006
incremented by: 10
heat-2006
incremented by: 20
heat-2012
incremented by: 10
heat-2012
incremented by: 20
heat-2013
incremented by: 10
heat-2013
incremented by: 20
hunt-1998
incremented by: 10
hunt-1998
incremented by: 20
hunt-1998
incremented by: 30
knicks-1999
incremented by: 10
knicks-1999
incremented by: 20
mavericks-2011
incremented by: 10
mavericks-2011
incremented by: 20
pace-2005
incremented by: 10
pace-2005
incremented by: 20
pace-2005
incremented by: 30
pace-2006
incremented by: 10
pace-2006
incremented by: 20
pace-2006
incremented by: 30
penn-2004
incremented by: 10
penn-2004
incremented by: 20
penn-

In [35]:
# Top-10 similar words for every static query from the static baseline, with entity filtering enabled.
similarities_static_static = test_util.get_similarities_of_models_static(model_static, test_case_dict_static_static, entity_list=entity_list, filter=True, top_n=10)

  0%|          | 0/367 [00:00<?, ?it/s]

heat-2006
incremented by: 10
heat-2006
incremented by: 20
heat-2012
incremented by: 10
heat-2012
incremented by: 20
heat-2013
incremented by: 10
heat-2013
incremented by: 20
pace-2005
incremented by: 10
pace-2005
incremented by: 20
pace-2005
incremented by: 30
pace-2006
incremented by: 10
pace-2006
incremented by: 20
pace-2006
incremented by: 30
penn-2004
incremented by: 10
penn-2004
incremented by: 20
penn-2009
incremented by: 10
penn-2009
incremented by: 20
sessions-1990
incremented by: 10
sessions-1990
incremented by: 20
sessions-1990
incremented by: 30
sessions-1990
incremented by: 40
sessions-1991
incremented by: 10
sessions-1991
incremented by: 20
sessions-1991
incremented by: 30
sessions-1991
incremented by: 40
sessions-1992
incremented by: 10
sessions-1992
incremented by: 20
sessions-1992
incremented by: 30
sessions-1992
incremented by: 40


In [36]:
# Top-10 similar words for every dynamic query from the static baseline, with entity filtering enabled.
similarities_static_dynamic = test_util.get_similarities_of_models_static(model_static, test_case_dict_static_dynamic, entity_list=entity_list, filter=True, top_n=10)

  0%|          | 0/403 [00:00<?, ?it/s]

bridges-2010
incremented by: 10
bridges-2010
incremented by: 20
bridges-2010
incremented by: 30
bridges-2010
incremented by: 40
bridges-2010
incremented by: 50
ebola-2015
incremented by: 10
ebola-2015
incremented by: 20
ebola-2015
incremented by: 30
ebola-2015
incremented by: 40
ebola-2015
incremented by: 50
foster-1992
incremented by: 10
foster-1992
incremented by: 20
heat-2006
incremented by: 10
heat-2006
incremented by: 20
heat-2012
incremented by: 10
heat-2012
incremented by: 20
heat-2013
incremented by: 10
heat-2013
incremented by: 20
hunt-1998
incremented by: 10
hunt-1998
incremented by: 20
hunt-1998
incremented by: 30
knicks-1999
incremented by: 10
knicks-1999
incremented by: 20
mavericks-2011
incremented by: 10
mavericks-2011
incremented by: 20
pace-2005
incremented by: 10
pace-2005
incremented by: 20
pace-2005
incremented by: 30
pace-2006
incremented by: 10
pace-2006
incremented by: 20
pace-2006
incremented by: 30
penn-2004
incremented by: 10
penn-2004
incremented by: 20
penn-

### TPPMI

In [37]:
%autoreload 2

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel
import src.test.util as test_util

In [38]:
# Folder holding the PPMI data that was built with 2000 context words.
_tppmi_dir = path_to_tppmi_model / "2000"

# PPMI matrices (.npz) and pickled word lists (.pkl), both in sorted order.
ppmi_data_files = sorted(glob(str(_tppmi_dir / "*.npz")))
words_files = sorted(glob(str(_tppmi_dir / "*.pkl")))

Split context-words from timestamped-vocabularies

In [39]:
# Files with "context-words" in their path hold the shared context words;
# every other pickle is a per-year vocabulary.
context_words_files = list(filter(lambda path: "context-words" in path, words_files))
ppmi_vocab_files = list(filter(lambda path: "context-words" not in path, words_files))

In [40]:
# List the context-words file(s) that were found.
for context_words_file in context_words_files:
    print(context_words_file)

../../data/ppmi-matrices/nyt-data/2000/context-words.pkl


In [41]:
# Load every yearly PPMI matrix together with its vocabulary, keyed by year.
if len(ppmi_vocab_files) != len(ppmi_data_files):
    # zip() would silently drop the surplus files and could pair the rest incorrectly.
    raise ValueError(
        f"Found {len(ppmi_vocab_files)} vocabulary files but {len(ppmi_data_files)} PPMI matrices"
    )

ppmi_matrices = {}

# Both lists are sorted, so the i-th vocabulary is paired with the i-th matrix.
for vocab_file, matrix_file in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(matrix_file)
    # NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    # The year is the four characters after the first "ppmi-" in the file name. Using the file
    # name instead of the full path keeps this independent of the parent directory names.
    key = Path(vocab_file).name.split("ppmi-")[1][0:4]
    ppmi_matrices[key] = {"ppmi_matrix": ppmi_matrix, "vocab": vocab}

# Get common context-words
if not context_words_files:
    # Without this check `context_words` would stay undefined and fail later with a NameError.
    raise FileNotFoundError("No context-words file found next to the PPMI matrices")
# If several context-words files exist, the last one wins.
for context_words_file in context_words_files:
    with open(context_words_file, "rb") as f:
        context_words = pickle.load(f)

In [42]:
# Check which years were loaded.
ppmi_matrices.keys()

dict_keys(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'])

Create ppmi_model objects

In [43]:
# Wrap each year's matrix and vocabulary in a PPMIModel; all years share the same context words.
ppmi_models = {}
for year, ppmi_data in ppmi_matrices.items():
    ppmi_models[year] = PPMIModel.construct_from_data(
        ppmi_data["ppmi_matrix"], ppmi_data["vocab"], context_words, normalize=True
    )

In [44]:
# Combine the yearly PPMI models into one temporal model (yearly dates, smoothing disabled).
tppmi_model = TPPMIModel(ppmi_models, dates="years", smooth=False)

Create test-dictionaries for all test cases

In [45]:
# Test-case dictionary for all queries against the TPPMI model.
# The helper reports how many test cases are missing from the model vocabulary (see output).
test_case_dict_tppmi_all = test_util.create_test_case_dict_tppmi(tppmi_model, test_cases)

15 Testcases are not in the vocab of the model


Create test-dictionaries for static test cases

In [46]:
# Test-case dictionary for the static subset against the TPPMI model.
test_case_dict_tppmi_static = test_util.create_test_case_dict_tppmi(tppmi_model, test_cases_static)

13 Testcases are not in the vocab of the model


Create test-dictionaries for dynamic test cases

In [47]:
# Test-case dictionary for the dynamic subset against the TPPMI model.
test_case_dict_tppmi_dynamic = test_util.create_test_case_dict_tppmi(tppmi_model, test_cases_dynamic)

15 Testcases are not in the vocab of the model


#### Calculate similarities

Calculate similarities for all testcases

In [48]:
from itertools import islice


def first_n_items(d, n=5):
    """Return a new dict holding the first ``n`` key/value pairs of ``d`` in iteration order.

    If ``d`` has fewer than ``n`` entries, all of them are returned.
    """
    leading_pairs = islice(d.items(), n)
    return {key: value for key, value in leading_pairs}

In [49]:
# Five-entry subset of the test cases for quick experiments.
# NOTE(review): `test_case_dict_temp` is not used by any later cell shown here.
test_case_dict_temp = first_n_items(test_case_dict_tppmi_all, n=5)

In [50]:
# Top-10 similar words for every query (all test cases) from the TPPMI model, with entity filtering enabled.
# NOTE(review): label="PERSON" is passed, but the `domain` column of `entity_list` shows values
# such as INSTITUTIONS/SPORTS/ARTS; confirm how the helper uses `label`.
similarities_tppmi_all = test_util.get_similarites_of_models_tppmi(tppmi_model, test_case_dict_tppmi_all,
                                                                   entity_list=entity_list, filter=True, label="PERSON",top_n=10)

  0%|          | 0/388 [00:00<?, ?it/s]

incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented by: 20
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented 

In [51]:
# Inspect the result: query -> year -> list of (word, similarity) pairs.
# NOTE(review): this displays the complete nested dict; consider showing only a small sample.
similarities_tppmi_all

{'albright-1997': {'1990': [('baker', 0.19029661993812094),
   ('expert', 0.18811585318672527),
   ('political', 0.1834847041668293),
   ('draft', 0.1738689741544955),
   ('levin', 0.17291336077765238),
   ('bushs', 0.1720933096501163),
   ('encourage', 0.1674691243688012),
   ('acting', 0.1670667302974382),
   ('bush', 0.16524899779854857),
   ('idea', 0.16492371481766988)],
  '1991': [('bush', 0.2277487954403426),
   ('baker', 0.208811069027414),
   ('adviser', 0.20079191348146183),
   ('minister', 0.19290879456099527),
   ('talks', 0.18494888338291582),
   ('saddam', 0.1766433463077392),
   ('weed', 0.16969769635529114),
   ('discussed', 0.16854762981511145),
   ('croatia', 0.16766581880452716),
   ('cheney', 0.16708504294093174)],
  '1992': [('talks', 0.18304853004906668),
   ('clintons', 0.18033855020104955),
   ('momentarily', 0.17823153146690757),
   ('baker', 0.17374231514068678),
   ('saddam', 0.16834252007290854),
   ('crucial', 0.1660268959319778),
   ('bush', 0.165230757256

Calculate similarities for static testcases

In [57]:
# Top-10 similar words for every static query from the TPPMI model, with entity filtering enabled.
similarities_tppmi_static = test_util.get_similarites_of_models_tppmi(tppmi_model, test_case_dict_tppmi_static,
                                                                      entity_list=entity_list, filter=True, label="PERSON",top_n=10)

  0%|          | 0/354 [00:00<?, ?it/s]

incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented by: 20
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented 

Calculate similarities for dynamic testcases

In [58]:
# Top-10 similar words for every dynamic query from the TPPMI model, with entity filtering enabled.
similarities_tppmi_dynamic = test_util.get_similarites_of_models_tppmi(tppmi_model, test_case_dict_tppmi_dynamic,
                                                                       entity_list=entity_list, filter=True, label="PERSON",top_n=10)

  0%|          | 0/388 [00:00<?, ?it/s]

incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented by: 20
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 10
incremented by: 20
incremented by: 30
incremented by: 40
incremented by: 50
incremented by: 10
incremented 

In [83]:
# Debug check: which queries have TWEC similarities.
similarities_cade_all.keys()

dict_keys(['albright-1997', 'albright-1998', 'albright-1999', 'albright-2000', 'baker-1990', 'baker-1991', 'barak-1999', 'barak-2000', 'bates-1991', 'benedict-2005', 'benedict-2006', 'benedict-2007', 'benedict-2008', 'benedict-2009', 'benedict-2010', 'benedict-2011', 'benedict-2012', 'bernanke-2006', 'bernanke-2007', 'bernanke-2008', 'bernanke-2009', 'bernanke-2010', 'bernanke-2011', 'bernanke-2012', 'bernanke-2013', 'berry-2002', 'biden-2008', 'biden-2009', 'biden-2010', 'biden-2011', 'biden-2012', 'biden-2013', 'biden-2014', 'biden-2015', 'biden-2016', 'blair-1997', 'blair-1998', 'blair-1999', 'blair-2000', 'blair-2001', 'blair-2002', 'blair-2003', 'blair-2004', 'blair-2005', 'blair-2006', 'blasio-2014', 'blasio-2015', 'blasio-2016', 'bloomberg-2002', 'bloomberg-2003', 'bloomberg-2004', 'bloomberg-2005', 'bloomberg-2006', 'bloomberg-2007', 'bloomberg-2008', 'bloomberg-2009', 'bloomberg-2010', 'bloomberg-2011', 'bloomberg-2012', 'bloomberg-2013', 'brennan-2013', 'brennan-2014', 'brenn

In [92]:
# Debug: pick the TWEC neighbours of one query for manual inspection.
test = similarities_cade_all["clinton-1999"]

In [94]:
# Debug leftover: this only echoes the literal key and has no effect on later cells.
"clinton-1999"

'clinton-1999'

In [95]:
# Debug: show the per-year neighbours selected above.
test

{'1990': [('bush', 0.864136278629303),
  ('bushs', 0.7391064167022705),
  ('reagan', 0.7047344446182251),
  ('reagans', 0.6684464812278748),
  ('cuomos', 0.6382134556770325),
  ('gorbachev', 0.6017365455627441),
  ('mccain', 0.5757175087928772),
  ('lawmakers', 0.575276255607605),
  ('presidents', 0.5712372660636902),
  ('cuomo', 0.5629218816757202)],
 '1991': [('bush', 0.8855997920036316),
  ('bushs', 0.7569085359573364),
  ('sununu', 0.7443342804908752),
  ('kemp', 0.6881834268569946),
  ('reagan', 0.6733303070068359),
  ('baker', 0.6230989098548889),
  ('cheneys', 0.6161164045333862),
  ('cuomo', 0.5960902571678162),
  ('mandela', 0.5933858752250671),
  ('reagans', 0.5886446833610535)],
 '1992': [('bush', 0.8582387566566467),
  ('bushs', 0.8008391261100769),
  ('clinton', 0.7626042366027832),
  ('clintons', 0.6546137928962708),
  ('bakers', 0.6398282051086426),
  ('cuomos', 0.6350112557411194),
  ('reagan', 0.6135473847389221),
  ('mubarak', 0.5920022130012512),
  ('yeltsins', 0.588

Persist the interim TPPMI similarities to disk so that they can be reloaded without recomputing them.

In [64]:
import json

path_to_interim_data = Path("../../data/interim")
# open(..., "w") does not create missing directories, so make sure the target folder exists.
path_to_interim_data.mkdir(parents=True, exist_ok=True)

# Cache the TPPMI similarities, one JSON file per subset.
# NOTE: JSON has no tuple type, so the (word, score) tuples are stored as two-element arrays
# and come back as lists when the files are loaded again.
tppmi_similarities_by_subset = {
    "all": similarities_tppmi_all,
    "static": similarities_tppmi_static,
    "dynamic": similarities_tppmi_dynamic,
}
for subset, similarities in tppmi_similarities_by_subset.items():
    with open(path_to_interim_data / f"similarities_tppmi_{subset}.json", "w") as file:
        json.dump(similarities, file, indent=4)

In [62]:
# Read the cached TPPMI similarities back in (same three files that were written above).
similarities_tppmi_all, similarities_tppmi_static, similarities_tppmi_dynamic = (
    json.loads((path_to_interim_data / f"similarities_tppmi_{subset}.json").read_text())
    for subset in ("all", "static", "dynamic")
)

# Experiment

To examine the quality of embedding alignment, we create a task to query equivalences across years.

For example, given obama-2012, we want to query its equivalent word in 2002. Since Obama was the U.S. president in 2012, his equivalent in 2002 is bush, who was the U.S. president at that time. In this way, we create two testsets.

All results are rounded to three decimal places.

In [65]:
# Cut-offs K for MP@K and the three evaluation subsets.
cutoffs = [1, 3, 5, 10]
list_of_types = ["static", "dynamic", "all"]

# Per model: [similarities, ground-truth frame] pairs in the same order as `list_of_types`.
list_of_data_cade = [
    [similarities_cade_static, test_data_static],
    [similarities_cade_dynamic, test_data_dynamic],
    [similarities_cade_all, test_data],
]
list_of_data_tppmi = [
    [similarities_tppmi_static, test_data_static],
    [similarities_tppmi_dynamic, test_data_dynamic],
    [similarities_tppmi_all, test_data],
]
list_of_data_static = [
    [similarities_static_static, test_data_static],
    [similarities_static_dynamic, test_data_dynamic],
    [similarities_static_all, test_data],
]

# subset name -> [similarities, ground truth]
config_dict_cade = dict(zip(list_of_types, list_of_data_cade))
config_dict_tppmi = dict(zip(list_of_types, list_of_data_tppmi))
config_dict_static = dict(zip(list_of_types, list_of_data_static))

# subset name -> {metric name: score}; filled in by the metric cells below.
scores_cade = {subset: {} for subset in list_of_types}
scores_static = {subset: {} for subset in list_of_types}
scores_tppmi = {subset: {} for subset in list_of_types}

## Mean Reciprocal Rank (@10)

The Mean Reciprocal Rank (MRR) is a statistical measure used to evaluate the performance of a system that returns a ranked list of responses to queries. It is the average of the reciprocal ranks of the first correct answer for each query, where the reciprocal rank is the inverse of the rank at which the first relevant answer is found.
Here it is evaluated on the top 10 retrieved words (MRR@10).

### TWEC

In [66]:
# Debug check: which queries have TPPMI similarities.
similarities_tppmi_all.keys()

dict_keys(['albright-1997', 'albright-1998', 'albright-1999', 'albright-2000', 'baker-1990', 'baker-1991', 'barak-1999', 'barak-2000', 'bates-1991', 'benedict-2005', 'benedict-2007', 'benedict-2008', 'benedict-2009', 'benedict-2010', 'benedict-2011', 'benedict-2012', 'bernanke-2007', 'bernanke-2008', 'bernanke-2009', 'bernanke-2010', 'bernanke-2011', 'bernanke-2012', 'bernanke-2013', 'berry-2002', 'biden-2008', 'biden-2009', 'biden-2010', 'biden-2011', 'biden-2012', 'biden-2013', 'biden-2014', 'biden-2015', 'biden-2016', 'blair-1997', 'blair-1998', 'blair-1999', 'blair-2000', 'blair-2001', 'blair-2002', 'blair-2003', 'blair-2004', 'blair-2005', 'blair-2006', 'blasio-2014', 'blasio-2015', 'blasio-2016', 'bloomberg-2002', 'bloomberg-2003', 'bloomberg-2004', 'bloomberg-2005', 'bloomberg-2006', 'bloomberg-2007', 'bloomberg-2008', 'bloomberg-2009', 'bloomberg-2010', 'bloomberg-2011', 'bloomberg-2012', 'bloomberg-2013', 'brennan-2013', 'brennan-2014', 'brennan-2015', 'brennan-2016', 'bridges

In [67]:
# MRR@10 of TWEC for each subset, rounded to three decimals.
for subset, (similarities, ground_truth) in tqdm(config_dict_cade.items()):
    mrr = test_util.calculate_rank_metric(similarities, ground_truth, metric='MRR', k=10)
    scores_cade[subset]["mrr@10"] = round(mrr, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### Static Word2Vec

In [68]:
# MRR@10 of the static baseline for each subset, rounded to three decimals.
for subset, (similarities, ground_truth) in tqdm(config_dict_static.items()):
    mrr = test_util.calculate_rank_metric_static(similarities, ground_truth, metric='MRR', k=10)
    scores_static[subset]["mrr@10"] = round(mrr, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

### TPPMI

In [69]:
# MRR@10 of TPPMI for each subset, rounded to three decimals.
for subset, (similarities, ground_truth) in tqdm(config_dict_tppmi.items()):
    mrr = test_util.calculate_rank_metric(similarities, ground_truth, metric='MRR', k=10)
    scores_tppmi[subset]["mrr@10"] = round(mrr, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

## Mean Precision (@K)

As introduced by Yao et al. (2018), MP@K is defined as follows: consider the K words most similar to the query embedding for the given year. The Precision@K for a particular test i, denoted P@K[i], equals 1 if the target word appears within this set of K words and 0 otherwise. MP@K is the mean of P@K[i] over all tests.

### TWEC

In [70]:
# MP@K of TWEC for every cut-off and subset, rounded to three decimals.
for subset, (similarities, ground_truth) in tqdm(config_dict_cade.items()):
    scores_cade[subset].update({
        f"mp@{k}": round(test_util.calculate_rank_metric(similarities, ground_truth, metric='MP', k=k), 3)
        for k in cutoffs
    })

  0%|          | 0/3 [00:00<?, ?it/s]

### Static Word2Vec

In [71]:
# MP@K of the static baseline for every cut-off and subset, rounded to three decimals.
for subset, (similarities, ground_truth) in tqdm(config_dict_static.items()):
    scores_static[subset].update({
        f"mp@{k}": round(test_util.calculate_rank_metric_static(similarities, ground_truth, metric='MP', k=k), 3)
        for k in cutoffs
    })

  0%|          | 0/3 [00:00<?, ?it/s]

### TPPMI

In [72]:
# MP@K of TPPMI for every cut-off and subset, rounded to three decimals.
for subset, (similarities, ground_truth) in tqdm(config_dict_tppmi.items()):
    scores_tppmi[subset].update({
        f"mp@{k}": round(test_util.calculate_rank_metric(similarities, ground_truth, metric='MP', k=k), 3)
        for k in cutoffs
    })

  0%|          | 0/3 [00:00<?, ?it/s]

## Results

In [73]:
# One table per model: rows are the subsets (static/dynamic/all), columns are the metrics.
score_table_cade = pd.DataFrame(scores_cade).transpose()
score_table_static = pd.DataFrame(scores_static).transpose()
score_table_tppmi = pd.DataFrame(scores_tppmi).transpose()

for heading, table in (
    ("Scores of TWEC", score_table_cade),
    ("Scores of TPPMI (2000 context-words)", score_table_tppmi),
    ("Scores of Static Word2Vec (Baseline)", score_table_static),
):
    print(heading)
    display(table)

Scores of TWEC


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
static,0.611,0.532,0.669,0.721,0.766
dynamic,0.319,0.251,0.363,0.41,0.465
all,0.382,0.314,0.429,0.475,0.524


Scores of TPPMI (2000 context-words)


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
static,0.538,0.435,0.616,0.676,0.739
dynamic,0.212,0.145,0.251,0.301,0.373
all,0.288,0.212,0.338,0.39,0.455


Scores of Static Word2Vec (Baseline)


Unnamed: 0,mrr@10,mp@1,mp@3,mp@5,mp@10
static,1.0,1.0,1.0,1.0,1.0
dynamic,0.158,0.0,0.365,0.427,0.464
all,0.439,0.333,0.577,0.618,0.643


In [74]:
model_order = ['TWEC', 'TPPMI (2000 context-words)', 'Static Word2Vec (Baseline)']

# Tag each table with its model name. This adds a 'Model' column to the tables themselves,
# so the column is also present when they are used afterwards.
for model_name, table in zip(model_order, (score_table_cade, score_table_tppmi, score_table_static)):
    table['Model'] = model_name

# Stack the tables and index the result by (model, subset), keeping the models in `model_order`.
merged_score_table = pd.concat([score_table_cade, score_table_tppmi, score_table_static], ignore_index=False)
merged_score_table = merged_score_table.set_index(['Model', merged_score_table.index])
merged_score_table = merged_score_table.reindex(model_order, level='Model').round(3)

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

merged_score_table

Unnamed: 0_level_0,Unnamed: 1_level_0,mrr@10,mp@1,mp@3,mp@5,mp@10
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TWEC,static,0.611,0.532,0.669,0.721,0.766
TWEC,dynamic,0.319,0.251,0.363,0.41,0.465
TWEC,all,0.382,0.314,0.429,0.475,0.524
TPPMI (2000 context-words),static,0.538,0.435,0.616,0.676,0.739
TPPMI (2000 context-words),dynamic,0.212,0.145,0.251,0.301,0.373
TPPMI (2000 context-words),all,0.288,0.212,0.338,0.39,0.455
Static Word2Vec (Baseline),static,1.0,1.0,1.0,1.0,1.0
Static Word2Vec (Baseline),dynamic,0.158,0.0,0.365,0.427,0.464
Static Word2Vec (Baseline),all,0.439,0.333,0.577,0.618,0.643


----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------

In [75]:
score_table_dir = Path("../../data/results/filtered_with_pantheon_dataset")
score_table_dir.mkdir(parents=True, exist_ok=True)

# Write one CSV per model: TWEC/CADE, TPPMI (2000 context words) and the static Word2Vec baseline.
for table, csv_name in (
    (score_table_cade, "score_table_cade.csv"),
    (score_table_tppmi, 'score_table_tppmi_2000.csv'),
    (score_table_static, 'score_table_static.csv'),
):
    table.to_csv(score_table_dir / csv_name, index=True)

# Load scores from disk

In [None]:
# Load the score tables back from `score_table_dir`.
score_tables = test_util.load_score_tables(score_table_dir)

In [None]:
# List the names of the loaded score tables.
score_tables.keys()

In [None]:
for name, score_table in score_tables.items():
    # Tables whose name contains "500" are left out of this overview.
    if "500" not in name:
        model_label = name.split('table_')[-1].capitalize()
        print(f"Scores for the model: {model_label}")
        display(score_table)