In [None]:
import pickle
import pandas as pd
from glob import glob
from pathlib import Path
from scipy import spatial
import scipy.sparse as sp
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel

# Imports

In [None]:
# Project-relative locations of the evaluation data and of the trained models.
path_to_test_data = Path("../../data", "test")
path_to_tppmi_model = Path("../../data", "ppmi-matrices", "nyt-data")
path_to_twec_model = Path("../../model", "nyt-data", "cade", "model")

## Testsets

### Testset 1

Testset 1 is based on publicly recorded knowledge: for each year, it lists the name associated with a particular role, such as U.S. president, U.K. prime minister, NFL Super Bowl champion team, and so on.

In [None]:
# Load testset 1 from the shared test-data folder.
test_data_1 = pd.read_csv(path_to_test_data / "testset_1.csv")

In [None]:
# Give the two columns fixed names; pandas raises if the frame does not have exactly two columns.
test_data_1.columns = ['truth', 'equivalent']

In [None]:
# Distinct values of the 'truth' column; each one is later parsed as "<word>-<year>".
test_cases_1 = test_data_1['truth'].unique()

In [None]:
# Report the number of distinct test cases in testset 1.
print(f"Testset1 contains {len(test_cases_1)} test cases")

### Testset 2

Testset 2 is human-generated and explores more interesting concepts such as emerging technologies, brands, and major events (e.g., disease outbreaks and financial crises). To construct the test word pairs, we first select emerging terms that had not been popularized before 1994, and then query their well-known precedents from 1990 to 1994 (e.g., app-2012 can correspond to software-1990).

In [None]:
# Load testset 2 from the shared test-data folder.
test_data_2 = pd.read_csv(path_to_test_data / "testset_2.csv")

In [None]:
# Same column names as testset 1; pandas raises if the frame does not have exactly two columns.
test_data_2.columns = ['truth', 'equivalent']

In [None]:
# Preview the first rows of testset 2.
test_data_2.head()

## Models

### TWEC

In [None]:
# Collect every saved model file. glob() returns matches in arbitrary order, so
# sort them: this keeps the order of `cade_models` (and therefore which model
# `next(iter(cade_models))` picks) reproducible across runs and machines.
cade_model_filenames = sorted(glob(str(path_to_twec_model / "*.model")))

In [None]:
# Load every saved model and key it as "model_<YYYY>", where <YYYY> is the four
# characters that directly precede the first "_data" in the file path.
cade_models = {
    f"model_{path.split('_data')[0][-4:]}": Word2Vec.load(path)
    for path in tqdm(cade_model_filenames)
}

In [None]:
# Check which "model_<YYYY>" keys were derived from the file names.
cade_models.keys()

#### Testset 1

In [None]:
# test_case_dict_1 maps each test case to the embedding of its word;
# counter tracks test cases whose word is missing from the model's vocabulary.
test_case_dict_1 = dict()
counter = 0

In [None]:
# Number of distinct test cases that the lookup below iterates over.
len(test_cases_1)

In [None]:
# Look up the embedding of every test-case word in the model of its own year.
# The accumulators are (re)initialised here so that re-running this cell does
# not double-count misses or keep stale entries from an earlier run.
test_case_dict_1 = dict()
counter = 0  # test cases whose word is absent from its year's vocabulary

for test_case in test_cases_1:
    # Split on the last hyphen only, so words that themselves contain "-"
    # still unpack into exactly (word, year).
    word, year = test_case.rsplit("-", 1)
    ground_model = cade_models[f"model_{year}"]
    if word in ground_model.wv.vocab:
        test_case_dict_1[test_case] = ground_model.wv.get_vector(word)
    else:
        counter += 1
print(counter)

In [None]:
# Inspect the collected test-case embeddings.
test_case_dict_1

In [None]:
# Take the first model in the dict's iteration order as the single model for the ad-hoc checks below.
cade_model = cade_models[next(iter(cade_models))]

In [None]:
def get_similarities_of_model(model, test_word, top_n = 10):
    """Rank a model's vocabulary by cosine similarity to a query embedding.

    Args:
        model: Object exposing ``model.wv.vocab`` (iterable of words) and
            ``model.wv[word]`` (embedding lookup).
        test_word: ``(key, embedding)`` pair; only the embedding (index 1)
            is used for scoring.
        top_n: Maximum number of matches to return.

    Returns:
        List of ``(word, similarity)`` tuples sorted by decreasing cosine
        similarity, at most ``top_n`` entries long.
    """
    test_word_embedding = test_word[1]

    # Store each score under the vocabulary word it was computed for. The
    # earlier version keyed on the notebook-global ``word``, which collapsed
    # every score into a single entry (or raised NameError on a fresh kernel).
    word_similarities = {
        reference_word: 1 - spatial.distance.cosine(test_word_embedding, model.wv[reference_word])
        for reference_word in model.wv.vocab
    }

    # Highest similarity first, then keep only the requested number of matches.
    sorted_similarities = sorted(word_similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_similarities[:top_n]

In [None]:
def get_similarities_of_models(model_dict: dict, test_word_dict: dict):
    """Score every test word against every model.

    Args:
        model_dict: Maps model names containing at least one ``"_"`` to models;
            the second ``"_"``-separated part of the name becomes the label.
        test_word_dict: Maps each test-word key to its embedding.

    Returns:
        Nested dict ``{test_word_key: {label: matches}}`` where ``matches`` is
        the value returned by ``get_similarities_of_model`` for that pair.
    """
    return {
        test_item[0]: {
            model_name.split("_")[1]: get_similarities_of_model(model, test_item)
            for model_name, model in model_dict.items()
        }
        for test_item in test_word_dict.items()
    }

In [None]:
# Score every collected test case against every loaded model.
similarities = get_similarities_of_models(cade_models, test_case_dict_1)

In [None]:
# Inspect the loaded models.
cade_models

In [None]:
# Display the result already stored in `similarities` instead of scoring every
# test case against every model a second time.
similarities

In [None]:
# Score every vocabulary word of the selected model against the embedding of
# the first collected test case.
word_similarities = {
    vocab_word: 1 - spatial.distance.cosine(
        test_case_dict_1[next(iter(test_case_dict_1))], cade_model.wv[vocab_word]
    )
    for vocab_word in cade_model.wv.vocab
}

# Best matches first
sorted_similarities = sorted(word_similarities.items(), key=lambda pair: pair[1], reverse=True)

# Keep only the leading entries and print them
top_n = 10
most_similar_words = sorted_similarities[:top_n]

for word, similarity in most_similar_words:
    print(f"{word}: {similarity}")

In [None]:
# most_similar() needs a query: called with no arguments gensim raises
# "cannot compute similarity with no input". Query by vector with the first
# test-case embedding, so the output can be compared with the manual ranking.
cade_model.wv.similar_by_vector(test_case_dict_1[next(iter(test_case_dict_1))], topn=10)

### TPPMI

In [None]:
# Sorted lists of the matrix (.npz) and pickle (.pkl) files in the PPMI folder.
# The loading cell pairs entries from these lists by position, so the sort order matters.
ppmi_data_files = sorted(glob(str(path_to_tppmi_model  / "*.npz")))
words_files = sorted(glob(str(path_to_tppmi_model  / "*.pkl")))

Split context-words from timestamped-vocabularies

In [None]:
# Pickles whose path contains "context-words" hold the context words;
# every other pickle is treated as a per-timestamp vocabulary.
context_words_file = [path for path in words_files if "context-words" in path]
ppmi_vocab_files = [path for path in words_files if "context-words" not in path]

In [None]:
# Load one PPMI matrix and its vocabulary per time slice.
# NOTE: vocabulary and matrix files are paired purely by their position in the
# two sorted file lists, and pickle.load must only be used on trusted files.
ppmi_matrices = {}

for vocab_path, matrix_path in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(matrix_path)
    with open(vocab_path, "rb") as vocab_file:
        vocab = pickle.load(vocab_file)
    # Key = the four characters that follow the second "ppmi-" in the vocabulary path
    key = vocab_path.split("ppmi-")[2][:4]
    ppmi_matrices[key] = {"ppmi_matrix": ppmi_matrix, "vocab": vocab}

# Context words shared by all matrices (first matching file)
with open(context_words_file[0], "rb") as context_file:
    context_words = pickle.load(context_file)

In [None]:
# Check which keys were parsed from the vocabulary file names.
ppmi_matrices.keys()

Create ppmi_model objects

In [None]:
# One PPMIModel per loaded time slice, all built with the same context words.
ppmi_models = {
    key: PPMIModel.construct_from_data(entry["ppmi_matrix"], entry["vocab"], context_words)
    for key, entry in ppmi_matrices.items()
}

In [None]:
# Wrap the per-slice PPMI models in a single TPPMIModel.
# NOTE(review): dates="years" presumably declares that the dict keys are years — confirm against TPPMIModel.
tppmi_model = TPPMIModel(ppmi_models, dates="years")

# Evaluation Method

## MRR (Mean Reciprocal Rank)

# Experiment

To examine the quality of embedding alignment, we create a task to query equivalences across years.

For example, given obama-2012, we want to query its equivalent word in 2002. Since obama was the U.S. president in 2012, its equivalent in 2002 is bush, who was the U.S. president at that time. Following this idea, we create two testsets.

In [None]:
# Preview the first rows of testset 1.
test_data_1.head()