In [1]:
from gensim.models import Word2Vec
import gensim
#from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from numpy.linalg import svd
import numpy as np
from tqdm import tqdm

In [9]:
class SentenceIterator:
    """Restartable stream of tokenised sentences read from a text file.

    Every line of the file is treated as one sentence and is split on single
    space characters. The file is re-opened each time iteration starts, so
    the same instance can be iterated several times.

    Parameters
    ----------
    filename : str
        Path of the corpus file (one sentence per line).
    max_lines : int or None, optional
        Maximum number of lines to yield per pass. The default, 100001,
        reproduces the cut-off that used to be hard-coded here. ``None``
        reads the whole file.
    encoding : str, optional
        Text encoding used to decode the file. It is set explicitly so that
        non-ASCII characters are decoded the same way on every platform
        instead of depending on the locale default.
    """

    def __init__(self, filename, max_lines=100_001, encoding="utf-8"):
        self.filename = filename
        self.max_lines = max_lines
        self.encoding = encoding

    def __iter__(self):
        """Yield one list of tokens per line, stopping after ``max_lines`` lines."""
        with open(self.filename, 'r', encoding=self.encoding) as f:
            for i, line in enumerate(f):
                if self.max_lines is not None and i >= self.max_lines:
                    break
                # Only the newline is stripped; splitting on " " keeps empty
                # tokens for consecutive spaces, exactly as before.
                yield line.strip("\n").split(" ")

In [16]:
# One sentence stream per corpus: corpus1 -> iterator_1, corpus2 -> iterator_2.
iterator_1, iterator_2 = (
    SentenceIterator(corpus_path)
    for corpus_path in (
        "../semeval2020_ulscd_swe/corpus1/lemma/kubhist2a.txt",
        "../semeval2020_ulscd_swe/corpus2/lemma/kubhist2b.txt",
    )
)

In [11]:
# Train embeddings on the first corpus; apart from the context window,
# every hyper-parameter is left at the library default.
model_1 = Word2Vec(sentences=iterator_1, window=5)

In [22]:
# Persist the corpus-1 model so it can be reloaded later instead of retrained.
model_1.save("../models/model_old.model")

In [17]:
# Train embeddings on the second corpus with the same settings as the first,
# so the two vector spaces differ only in their training data.
model_2 = Word2Vec(sentences=iterator_2, window=5)

In [23]:
# Persist the corpus-2 model so it can be reloaded later instead of retrained.
model_2.save("../models/model_new.model")

In [43]:
# Load the saved models here instead of retraining if the embeddings have already been computed and saved
#model_1 = Word2Vec.load("../models/model_old.model")
#model_2 = Word2Vec.load("../models/model_new.model")

In [24]:
# Words present in the vocabulary of both models; only these can be compared.
common_words = list(
    set(model_1.wv.key_to_index) & set(model_2.wv.key_to_index)
)

In [25]:
# Align matrices using the orthogonal Procrustes procedure
def align_matrices(A, B):
    """Rotate ``A`` onto ``B`` with an orthogonal Procrustes transform.

    The SVD of ``A @ B.T`` is ``U * S * Vt``; the transform applied to ``A``
    is ``R = V @ U.T`` (the singular values are not used).

    Returns
    -------
    tuple
        ``(R @ A, B)`` -- the transformed ``A`` and ``B`` itself, unchanged.
    """
    cross = A @ B.T
    left, _, right_t = svd(cross, full_matrices=False)
    rotation = right_t.T @ left.T
    return rotation @ A, B

def align_embeddings(emb_1, emb_2):
    """Align ``emb_1`` to ``emb_2`` and return ``(aligned emb_1, emb_2)``.

    Both inputs are transposed before being handed to ``align_matrices`` and
    the aligned result is transposed back, so the returned array has the same
    orientation as ``emb_1``. ``emb_2`` is returned as passed in.
    """
    aligned_t = align_matrices(emb_1.T, emb_2.T)[0]
    return aligned_t.T, emb_2

In [26]:
# Stack the vectors of the shared words, in `common_words` order, once per model
# so that row i of both arrays belongs to the same word.
common_embs_1, common_embs_2 = (
    np.array([model.wv[token] for token in common_words])
    for model in (model_1, model_2)
)

In [27]:
# Rotate the corpus-1 vectors into the corpus-2 space. Only the first returned
# element is needed (the second is common_embs_2 itself). Indexing is used
# instead of unpacking into `_` so that IPython's `_` last-output variable is
# not shadowed by a user assignment.
rotated_vectors = align_embeddings(common_embs_1, common_embs_2)[0]

In [28]:
# Word -> vector lookups: rotated corpus-1 vectors and original corpus-2 vectors.
# Rows are paired with words by position, following `common_words` order.
emb_dict_1 = dict(zip(common_words, rotated_vectors))
emb_dict_2 = dict(zip(common_words, common_embs_2))

In [29]:
# Cosine similarity between the two aligned vectors of every shared word.
# cosine_similarity expects 2-D inputs, hence the reshape to a single row;
# [0][0] pulls the scalar out of the resulting 1x1 matrix.
similarities = {
    token: cosine_similarity(
        emb_dict_1[token].reshape(1, -1),
        emb_dict_2[token].reshape(1, -1),
    )[0][0]
    for token in common_words
}

In [30]:
# (word, similarity) pairs in ascending order of similarity,
# i.e. the words whose vectors differ most between the models come first.
sorted_by_similarity = list(similarities.items())
sorted_by_similarity.sort(key=lambda pair: pair[1])

In [45]:
# The 20 shared words with the lowest cross-model similarity.
sorted_by_similarity[:20]

[('stälia', -0.3606568),
 ('1319', -0.31230494),
 ('börre', -0.29665688),
 ('chans', -0.2856727),
 ('agia', -0.28157967),
 ('bemåla', -0.25456107),
 ('fattet', -0.25157046),
 ('golli', -0.24208164),
 ('tolli', -0.2381638),
 ('egga', -0.23287061),
 ('klädtorkning', -0.22133023),
 ('ssal', -0.22058086),
 ('iean', -0.20308968),
 ('källsta', -0.1919604),
 ('nymålning', -0.18984091),
 ('licence', -0.1897186),
 ('åfwen', -0.18690658),
 ('mohn', -0.1854572),
 ('ennis', -0.18218979),
 ('köna', -0.18187992)]

In [33]:
def compare_most_similar(model_1, model_2, word, topn=10):
    """Print the nearest neighbours of ``word`` in two models side by side.

    Parameters
    ----------
    model_1, model_2 : objects exposing ``.wv.most_similar(word, topn=...)``
        The two models to compare.
    word : str
        Query word. NOTE(review): presumably the lookup raises ``KeyError``
        when the word is missing from a vocabulary -- confirm against the
        library documentation.
    topn : int, optional
        Number of neighbours requested from each model (default 10).

    Returns
    -------
    None
        The table is written to standard output, one row per rank with the
        neighbour and its similarity (three decimals) for each model.
    """
    neighbours_1 = model_1.wv.most_similar(word, topn=topn)
    neighbours_2 = model_2.wv.most_similar(word, topn=topn)
    print("\t Model 1 \t\tModel 2")
    # zip stops at the shorter list, so a row is printed only while both
    # models still have a neighbour at that rank.
    for rank, ((word_1, score_1), (word_2, score_2)) in enumerate(zip(neighbours_1, neighbours_2)):
        print(f"{rank}\t{word_1}: {score_1:.3f}\t\t{word_2}: {score_2:.3f}")

In [50]:
# Spot-check a single word: list its nearest neighbours in each model side by side.
compare_most_similar(model_1, model_2, "stockholm")

	 Model 1 		Model 2
0	slockholm: 0.873		göteborg: 0.856
1	stockbolm: 0.821		slockholm: 0.822
2	götheborg: 0.789		norrköping: 0.812
3	norrköping: 0.782		örebro: 0.763
4	stockdolm: 0.781		malmö: 0.759
5	stockholn: 0.747		jönköping: 0.742
6	örebro: 0.738		upsala: 0.742
7	carlskrona: 0.731		helsingfors: 0.734
8	stockhol: 0.730		linköping: 0.732
9	jönköping: 0.714		helsingborg: 0.712
