In [1]:
from gensim.models import Word2Vec
import gensim
#from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from numpy.linalg import svd
import numpy as np
from tqdm import tqdm

In [9]:
class SentenceIterator:
    """Restartable iterator over a corpus file with one sentence per line.

    Each line is split on single spaces and yielded as a list of tokens.
    The file is reopened on every ``__iter__`` call, so the same object can
    be iterated several times.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            instead of the platform's locale encoding so that non-ASCII
            words (e.g. "göteborg") are decoded the same way on every
            machine.
    """

    def __init__(self, filename, encoding="utf-8"):
        self.filename = filename
        self.encoding = encoding

    def __iter__(self):
        with open(self.filename, 'r', encoding=self.encoding) as f:
            for line in f:
                # Repeated spaces and blank lines would otherwise produce
                # empty-string tokens; drop them.
                tokens = [token for token in line.rstrip("\n").split(" ") if token]
                # A line without any token carries no sentence, skip it.
                if tokens:
                    yield tokens

In [16]:
# One sentence iterator per corpus file (corpus1 and corpus2, "lemma" variants).
iterator_1 = SentenceIterator(filename="../semeval2020_ulscd_swe/corpus1/lemma/kubhist2a.txt")
iterator_2 = SentenceIterator(filename="../semeval2020_ulscd_swe/corpus2/lemma/kubhist2b.txt")

In [11]:
# Train Word2Vec on the first corpus: context window of 5, every other
# hyperparameter left at the library default.
model_1 = Word2Vec(sentences=iterator_1, window=5)

In [22]:
# Write the model trained on the first corpus to disk.
model_1.save("../models/model_old.model")

In [17]:
# Train Word2Vec on the second corpus: context window of 5, every other
# hyperparameter left at the library default.
model_2 = Word2Vec(sentences=iterator_2, window=5)

In [23]:
# Write the model trained on the second corpus to disk.
model_2.save("../models/model_new.model")

In [43]:
# If the models have already been trained and saved, skip the training cells above and load them here instead by uncommenting the two lines below.
#model_1 = Word2Vec.load("../models/model_old.model")
#model_2 = Word2Vec.load("../models/model_new.model")

In [24]:
# Vocabulary shared by both models. The intersection is sorted because the
# iteration order of a set of strings changes between interpreter sessions
# (str hashing is randomised); sorting keeps the word order, and therefore the
# row order of every matrix built from it, identical on every run.
common_words = sorted(model_1.wv.key_to_index.keys() & model_2.wv.key_to_index.keys())

In [25]:
# Orthogonal Procrustes alignment of two matrices
def align_matrices(A, B):
    """Map ``A`` onto ``B`` with the orthogonal Procrustes solution.

    With ``A @ B.T = U S V^T`` the orthogonal matrix ``R = V U^T`` minimises
    the Frobenius norm of ``R @ A - B``. No determinant correction is applied,
    so ``R`` may contain a reflection.

    Args:
        A: 2-D array to be transformed; must have the same number of columns
            as ``B`` (columns are the items being matched).
        B: 2-D reference array; it is not modified.

    Returns:
        Tuple ``(R @ A, B)`` where the second element is the very same ``B``
        object that was passed in.
    """
    left, _, right_t = svd(A @ B.T, full_matrices=False)
    rotation = right_t.T @ left.T
    return rotation @ A, B

def align_embeddings(emb_1, emb_2):
    """Align row-wise embeddings ``emb_1`` to ``emb_2``.

    Both inputs are transposed before being handed to ``align_matrices`` and
    the aligned result is transposed back, so the returned array has the same
    layout as ``emb_1``.

    Args:
        emb_1: 2-D array to be aligned.
        emb_2: 2-D reference array; returned unchanged.

    Returns:
        Tuple ``(aligned emb_1, emb_2)``.
    """
    aligned_t = align_matrices(emb_1.T, emb_2.T)[0]
    return aligned_t.T, emb_2

In [26]:
# One matrix per model; row i holds that model's vector for common_words[i].
common_embs_1, common_embs_2 = (
    np.array([model.wv[token] for token in common_words])
    for model in (model_1, model_2)
)

In [27]:
# Only the aligned copy of the first embedding matrix is needed. The result is
# indexed rather than unpacked into `_`, because binding `_` at notebook top
# level shadows IPython's "last output" variable.
rotated_vectors = align_embeddings(common_embs_1, common_embs_2)[0]

In [28]:
# word -> vector lookups; row i of each matrix belongs to common_words[i].
emb_dict_1 = dict(zip(common_words, rotated_vectors))   # aligned vectors of model 1
emb_dict_2 = dict(zip(common_words, common_embs_2))     # vectors of model 2

In [29]:
# Cosine similarity between each word's aligned model-1 vector and its model-2
# vector. cosine_similarity works on 2-D inputs, hence the (1, dim) reshape and
# the [0][0] to pull the single value out of the 1x1 result.
similarities = {
    token: cosine_similarity(
        emb_dict_1[token].reshape(1,-1),
        emb_dict_2[token].reshape(1,-1),
    )[0][0]
    for token in common_words
}

In [30]:
# (word, similarity) pairs ordered from lowest to highest similarity.
sorted_by_similarity = list(similarities.items())
sorted_by_similarity.sort(key=lambda pair: pair[1])

In [36]:
# Last element of the list, shown as a (word, similarity) pair.
sorted_by_similarity[-1]

('icke', 0.91102165)

In [33]:
def compare_most_similar(model_1, model_2, word, topn=10):
    """Print the nearest neighbours of ``word`` in two models side by side.

    Each model is queried on its own through ``model.wv.most_similar``, so
    the two columns come from each model's own embedding space.

    Args:
        model_1: Object exposing ``wv.most_similar(word, topn=...)``; printed
            in the left column.
        model_2: Same interface; printed in the right column.
        word: Query word, passed unchanged to both models.
        topn: Number of neighbours requested from each model. Defaults to 10,
            which is also ``most_similar``'s own default, so existing calls
            print the same rows as before.

    Returns:
        None. One header line is printed, followed by one line per rank; if
        the two models return lists of different length, only as many rows as
        the shorter list are printed.
    """
    neighbours_1 = model_1.wv.most_similar(word, topn=topn)
    neighbours_2 = model_2.wv.most_similar(word, topn=topn)
    print("\t Model 1 \t\tModel 2")
    # zip stops at the shorter list, so no index can run out of range.
    for rank, ((word_1, sim_1), (word_2, sim_2)) in enumerate(zip(neighbours_1, neighbours_2)):
        print(f"{rank}\t{word_1}: {sim_1:.3f}\t\t{word_2}: {sim_2:.3f}")

In [41]:
# Neighbours of "göteborg" as returned by each trained model itself
# (the original models are passed in, not the rotated vectors).
compare_most_similar(model_1, model_2, "göteborg")

	 Model 1 		Model 2
0	malmö: 0.876		stockholm: 0.856
1	warberg: 0.837		malmö: 0.790
2	carlskrona: 0.829		slockholm: 0.786
3	marstrand: 0.819		norrköping: 0.771
4	kalmar: 0.818		helsingborg: 0.756
5	wisby: 0.817		linköping: 0.754
6	strömstad: 0.805		karlstad: 0.751
7	landskrona: 0.805		uddevalla: 0.744
8	uddewalla: 0.802		köpenhamn: 0.731
9	calmar: 0.799		hälsingborg: 0.721
