In [1]:
from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes
import fasttext
import numpy as np

In [2]:
def get_models(path1, path2):
    model_src = fasttext.load_model(path1)
    model_tgt = fasttext.load_model(path2)
    return (model_src, model_tgt)

In [64]:
def get_common_word_vecs(model1, model2):
    words1 = model1.get_words()
    words2 = model2.get_words()
    print("südafrika" in words1)
    print("südafrika" in words2)
    print(len(words2))
    m1_original_vectors = np.array([model1.get_word_vector(word) for word in words1])
    m2_original_vectors = np.array([model2.get_word_vector(word) for word in words2])
    common = list(set(words1) & set(words2))
    vocab_word_to_index_ = {}
    m1_common_vec = []
    m2_common_vec = []
    for i in range(len(common)):
        word = common[i]
        vocab_word_to_index_[word] = i
        v1 = model1.get_word_vector(word)
        v2 = model2.get_word_vector(word)
        v1 = v1/np.linalg.norm(v1)
        v2 = v2/np.linalg.norm(v2)
        m1_common_vec.append(v1)
        m2_common_vec.append(v2)
    
    m1_vec = np.array(m1_common_vec)
    m2_vec = np.array(m2_common_vec)

    #fetch missing
    missing_vocab_m1 = {}
    missing_vocab_m2 = {}
    for i in range(len(words1)):
        w = words1[i]
        if w not in common:
            missing_vocab_m1[w] = model1.get_word_vector(w)#- m1_mean
    for i in range(len(words2)):
        w = words2[i]
        if w not in common:
            missing_vocab_m2[w] = model2.get_word_vector(w)# - m2_mean
            
    return (vocab_word_to_index_, m1_vec, m2_vec, missing_vocab_m1, missing_vocab_m2, m1_original_vectors, m2_original_vectors)

In [65]:
vocab, vecs1, vecs2, missing_m1, missing_m2, point_cloud1, point_cloud2 = get_common_word_vecs(m1,m2)
print(vecs1.shape, vecs2.shape)

True
False
6895
(5150, 400) (5150, 400)


KeyError: 'südafrika'

In [86]:
class AlignedModel:
    def __init__(self, vocab_to_index, emb):
        self.emb = emb
        self.vocab = vocab_to_index.copy()
        print("Number of words in vocab:",len(self.vocab))
        self.word_count = len(self.vocab)
        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        self.emb = self.emb / np.linalg.norm(self.emb, axis=1, keepdims=True)
    def get_word_vector(self, word):
        return self.emb[self.vocab[word],:]
    def cos_similarity(self, v1, v2):
        cos_sim = (v1 @ v2.T) / (np.linalg.norm(v1)*np.linalg.norm(v2))
        return cos_sim
    def compare(self, word1, word2):
        v1 = self.get_word_vector(word1)
        v2 = self.get_word_vector(word2)
        return self.cos_similarity(v1,v2)
    def get_nearest_neighbors(self, word, topn=10):
        word_idx = self.vocab[word]
        denominator = self.emb@(self.emb[word_idx,:])
        similarities = denominator
        topk = np.argsort(similarities)[-topn-1:-1][::-1]
        for i in topk:
            print(f"{self.inverse_vocab[i]}: {similarities[i]}")
    def get_nearest_vectors(self, v, topn=10, exclude=None):
        v_norm = np.linalg.norm(v)
        similarities = (self.emb@v)/v_norm
        topk = np.argsort(similarities)[::-1]
        k = 0
        q = 0
        while q < topn:
            i = topk[k]
            if exclude == None:
                print(f"{self.inverse_vocab[i]}: {similarities[i]}")
                q+=1
            elif i not in exclude:
                print(f"{self.inverse_vocab[i]}: {similarities[i]}")
                q+=1
            k+=1
                
    def get_analogies(self, w1, w2, w3, topn=10):
        v1 = self.get_word_vector(w1)
        v2 = self.get_word_vector(w2)
        v3 = self.get_word_vector(w3)
        self.get_nearest_vectors(v1-v2+v3, topn, exclude=[self.vocab[w1], self.vocab[w2], self.vocab[w3]])

In [78]:
#SOURCE: https://fasttext.cc/docs/en/english-vectors.html, last accessed 12.08.2024, 14:11

import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    vocab = {}
    word_vectors = []
    for line in fin:
        tokens = line.rstrip().split(' ')
        vocab[tokens[0]] = len(word_vectors)
        word_vectors.append(np.array(list(map(float, tokens[1:]))))
    return vocab, word_vectors

In [84]:
model_1_vocab, model_1_vectors = load_vectors("../models/fasttext/afd_aligned.vec")
model_2_vocab, model_2_vectors = load_vectors("../models/fasttext/gruene_aligned.vec")

In [87]:
a_m1 = AlignedModel(model_1_vocab, model_1_vectors)
a_m2 = AlignedModel(model_2_vocab, model_2_vectors)

Number of words in vocab: 6438
Number of words in vocab: 5437


In [88]:
a_m1.get_nearest_neighbors("afd")

fraktion: 0.5817636921653645
antrag: 0.47407956576005655
fordern: 0.44713797983737236
fdp: 0.432479833810955
partei: 0.41127083210685245
deshalb: 0.4111341697546953
grüne: 0.4052617215088661
bundestag: 0.39777133935881104
dankherr: 0.3975764295314783
csu: 0.3937704865961197


In [89]:
a_m1.get_nearest_neighbors("feministisch", topn=10)

außenpolitik: 0.4689091647115785
außenpolitisch: 0.3710191267976566
kommunistisch: 0.32110881006017056
ministerin: 0.2994605379235993
außen: 0.2942367380445296
außenministerin: 0.2925523724773291
frau: 0.26897802038681135
entwicklungspolitik: 0.2588735199236723
weiblich: 0.2540375700436837
ahrtal: 0.23183078892176487


In [90]:
a_m1.get_nearest_neighbors("südafrika", topn=15)

südamerika: 0.3986828767570639
afrika: 0.36700290884341097
indonesien: 0.35343059425236933
pakistan: 0.3173756286153637
china: 0.2936060652138359
indien: 0.2845339783813964
peking: 0.23980055147073542
südsudan: 0.22676449175820879
sudan: 0.22517859672858548
rücktritt: 0.22285517480691183
entwicklungszusammenarbeit: 0.22179926412531464
entwickeln: 0.21798652598858312
omikron: 0.21364647025788047
mächt: 0.20339875011985026
erwähn: 0.20218218286446207


In [91]:
a_m1.compare("entwicklungszusammenarbeit", "südafrika")

0.22179926412531467

In [92]:
a_m1.compare("klima", "umwelt")

0.24342163413121998

In [93]:
a_m1.get_analogies("putin", "russland", "frankreich")

macron: 0.31091448033427965
italien: 0.27978522151493856
franzose: 0.2550854240390471
französisch: 0.2351356888421584
demokrat: 0.23095543768981955
spanien: 0.21943376109882895
luxemburg: 0.21351960728995628
tschechien: 0.21341944697887916
chefin: 0.20001103990662414
italiener: 0.19348991105975624


In [94]:
a_m2.get_nearest_neighbors("islam")

islamisch: 0.6720314073843481
islamist: 0.6378685565315334
islamismus: 0.5587765477933615
islamistisch: 0.4890151350066564
muslim: 0.4303569514532919
muslimisch: 0.3682572324943644
religion: 0.32053790882665706
religiös: 0.2917980446082167
religionsfreiheit: 0.2701279277808758
verfassungsfeindlich: 0.26352161033878385


In [100]:
def compute_biggest_shift(m1_aligned, m2_aligned, common_vocab):
    shifts = []
    for word in common_vocab:
        v1 = m1_aligned.get_word_vector(word)
        v2 = m2_aligned.get_word_vector(word)
        dist = np.linalg.norm(v1-v2)
        shifts.append((word, dist))
    sorted_by_dist = sorted(shifts, key=lambda tup: tup[1], reverse=True)
    print(sorted_by_dist[:100])

In [101]:
compute_biggest_shift(a_m1, a_m2, list(set(model_1_vocab.keys()) & set(model_2_vocab.keys())))

[('suggerieren', 1.5235837469734292), ('enteignung', 1.5188206343923079), ('martin', 1.5181380980467045), ('haustür', 1.5163230860263281), ('ad', 1.514646546537704), ('widerstand', 1.5057987761789844), ('atmen', 1.5012036997032971), ('krone', 1.5010254989904843), ('korrekt', 1.498063182671778), ('wichtigste', 1.488169900401477), ('regelrecht', 1.487819571854431), ('inklusive', 1.485513928934004), ('reißen', 1.4849460382154651), ('gott', 1.4834750103592476), ('alternativ', 1.4828793464818693), ('beamter', 1.4818769375761878), ('faktor', 1.4816897550372639), ('dankbar', 1.481432459882718), ('theorie', 1.481261443282041), ('rücken', 1.4806454064243777), ('eröffnen', 1.4802424885805805), ('zurückziehen', 1.4799199754743557), ('ford', 1.4797530786599373), ('ausdrücken', 1.4792891326553406), ('auslösen', 1.4785842325342302), ('einigkeit', 1.4776308931255284), ('begrüß', 1.4773499293791235), ('eins', 1.477219305051788), ('geplant', 1.4770096043444105), ('gemacht', 1.4747953332205477), ('freiw

In [443]:
a_m1.get_nearest_neighbors("erfolg")

tun: 0.3081129789352417
bundesregierung: 0.29253244400024414
seite: 0.28401291370391846
sprechen: 0.27862846851348877
wirtschaftlich: 0.2744219899177551
der: 0.2739515006542206
sollen: 0.2710886001586914
geben: 0.27041247487068176
politisch: 0.2701123356819153
frau: 0.2672087550163269


In [444]:
a_m2.get_nearest_neighbors("erfolg")

neu: 0.2567260265350342
groß: 0.23197662830352783
jahr: 0.2203635722398758
besonderer: 0.2193661630153656
letzter: 0.2162303626537323
stehen: 0.21001777052879333
erreichen: 0.20685796439647675
sehen: 0.20183128118515015
ganz: 0.2011057436466217
darauf: 0.19987910985946655


In [383]:
a_m1.get_nearest_neighbors("irrsinn")
a_m2.get_nearest_neighbors("irrsinn")

irrsinnig: 0.5054561692888472
welch: 0.22729446882487508
windindustrieanlag: 0.2134466379973569
vernichten: 0.20540815538084337
welcher: 0.20159260125297307
wahnsinn: 0.20051785546261558
kohleausstieg: 0.1969076098693952
energiepolitik: 0.19562326384339365
schädig: 0.19514797618465315
grün: 0.19347625696535914
irr: 0.36627251803178107
tagebau: 0.26502418882226064
unsinn: 0.2354000918803872
atomwaffe: 0.2347204401509786
windkraftanlage: 0.22886239946166592
mitwirken: 0.22839863974371388
neubau: 0.22155908948324157
irre: 0.22100863644437257
windkraft: 0.2200186408827055
zerstörung: 0.21694000969569632


In [380]:
test2 = AlignedModel(vocab_full_2, vecs2, zero_range_2)

Number of words in vocab: 5437
range(4456, 6438)
(5437, 300)


In [381]:
test2.get_nearest_neighbors("hanau")

halle: 0.5601162910461426
lübcke: 0.4330243468284607
anschlag: 0.4186391234397888
breitscheidplatz: 0.3907088339328766
nsu: 0.3698315918445587
rassistisch: 0.3676041066646576
opfer: 0.3576444685459137
walter: 0.35632702708244324
mord: 0.3559960722923279
synagoge: 0.3415311574935913
