In [33]:
import fasttext
import nltk
import pandas as pd
import numpy as np

In [3]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonkoehl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
stopwords_german = stopwords.words("german")

In [11]:
def get_models(path1, path2):
    model_src = fasttext.load_model(path1)
    model_tgt = fasttext.load_model(path2)
    return (model_src, model_tgt)

In [12]:
model_afd, model_gruene = get_models("../models/fasttext/afd_with_stopwords.bin", "../models/fasttext/gruene_with_stopwords.bin")

In [49]:
#Splits model vectors into the stopword vectors and the rest
def split_model_vectors(model, stops):
    words = model.get_words()
    stop_df = pd.DataFrame(columns = ["word", "vector"])
    other_df = pd.DataFrame(columns = ["word", "vector"])
    for word in words:
        if word in stops:
            stop_df.loc[len(stop_df)] = [word, model.get_word_vector(word)]
        else:
            other_df.loc[len(other_df)] = [word, model.get_word_vector(word)]
    return stop_df, other_df

In [50]:
afd_stop, afd_other = split_model_vectors_df(model_afd, stopwords_german)
gruene_stop, gruene_other = split_model_vectors(model_gruene, stopwords_german)

In [71]:
def find_correspondences(df1, df2):
    stop_df1 = df1.copy()
    stop_df2 = df2.copy()
    words1 = set(stop_df1["word"])
    words2 = set(stop_df2["word"])
    common = list(words1 & words2)
    stop_df1 = stop_df1[stop_df1["word"].isin(common)]
    stop_df2 = stop_df2[stop_df2["word"].isin(common)]
    stop_df1 = stop_df1.sort_values("word")
    stop_df2 = stop_df2.sort_values("word")
    return stop_df1, stop_df2

In [74]:
afd_corr, gruene_corr = find_correspondences(afd_stop, gruene_stop)

In [153]:
from scipy.linalg import orthogonal_procrustes
#SOURCE: CHATGPT (slightly altered and fixed), 
#PROMPT: How would I align two point clouds with a different number of points
#        using orthogonal procrustes, given I know some corresponding points?
def align(df1_corr, df2_corr, df1_other, df2_other):
    P_corr = np.array(df1_corr["vector"].tolist())
    Q_corr = np.array(df2_corr["vector"].tolist())
    
    P_other = np.array(df1_other["vector"].tolist())
    Q_other = np.array(df2_other["vector"].tolist())
    # Step 1: Compute centroids of corresponding points
    centroid_P_corr = np.mean(P_corr, axis=0)
    centroid_Q_corr = np.mean(Q_corr, axis=0)

    # Step 2: Center the corresponding points
    P_corr_centered = P_corr - centroid_P_corr
    Q_corr_centered = Q_corr - centroid_Q_corr
    
    # Step 3: Compute the optimal rotation using orthogonal Procrustes
    R, tr = orthogonal_procrustes(Q_corr_centered, P_corr_centered)
    # Step 4: Apply rotation to the other point cloud Q_other
    Q_other_centered = Q_other - np.mean(Q_other, axis=0)
    Q_other_rotated = Q_other_centered @ R
    
    # Step 5: Compute the translation vector for the other point cloud
    translation_vector = centroid_P_corr - np.mean(Q_other_rotated, axis=0)

    # Step 6: Apply the translation to align the point cloud
    Q_other_aligned = Q_other_rotated + translation_vector
    
    #Step 7: Update DataFrame
    for i in range(len(df2_other)):
        df2_other.loc[i]["vector"] = Q_other_aligned[i,:]
    
    return df1_other, df2_other

In [154]:
afd_vecs, gruene_vecs = align(afd_corr, gruene_corr, afd_other, gruene_other)

In [155]:
#CUSTOM CLASS TO PUT THE ALIGNED VECTORS INTO A MODEL WHICH IMPLEMENTS THE FASTTEXT METHODS
class AlignedModel:
    def __init__(self, df):
        self.emb = np.array(df["vector"].tolist())
        self.vocab = {}
        for word in df["word"].tolist():
            self.vocab[word] = len(self.vocab)
        print("Number of words in vocab:",len(self.vocab))
        self.word_count = len(self.vocab)
        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        self.emb = self.emb / np.linalg.norm(self.emb, axis=1, keepdims=True)
    def get_word_vector(self, word):
        return self.emb[self.vocab[word],:]
    def cos_similarity(self, v1, v2):
        cos_sim = (v1 @ v2.T) / (np.linalg.norm(v1)*np.linalg.norm(v2))
        return cos_sim
    def compare(self, word1, word2):
        v1 = self.get_word_vector(word1)
        v2 = self.get_word_vector(word2)
        return self.cos_similarity(v1,v2)
    def get_nearest_neighbors(self, word, topn=10):
        word_idx = self.vocab[word]
        denominator = self.emb@(self.emb[word_idx,:])
        similarities = denominator
        topk = np.argsort(similarities)[-topn-1:-1][::-1]
        for i in topk:
            print(f"{self.inverse_vocab[i]}: {similarities[i]}")
    def get_nearest_vectors(self, v, topn=10, exclude=None):
        v_norm = np.linalg.norm(v)
        similarities = (self.emb@v)/v_norm
        topk = np.argsort(similarities)[::-1]
        k = 0
        q = 0
        while q < topn:
            i = topk[k]
            if exclude == None:
                print(f"{self.inverse_vocab[i]}: {similarities[i]}")
                q+=1
            elif i not in exclude:
                print(f"{self.inverse_vocab[i]}: {similarities[i]}")
                q+=1
            k+=1
                
    def get_analogies(self, w1, w2, w3, topn=10):
        v1 = self.get_word_vector(w1)
        v2 = self.get_word_vector(w2)
        v3 = self.get_word_vector(w3)
        self.get_nearest_vectors(v1-v2+v3, topn, exclude=[self.vocab[w1], self.vocab[w2], self.vocab[w3]])

In [156]:
a_m1 = AlignedModel(afd_vecs)
a_m2 = AlignedModel(gruene_vecs)

Number of words in vocab: 6314
Number of words in vocab: 5335


In [160]:
a_m1.get_analogies("putin", "russland", "frankreich")

franzose: 0.5928472876548767
französisch: 0.5101040601730347
macron: 0.4922094941139221
italien: 0.48235827684402466
brennen: 0.4498916268348694
deutsche: 0.4343798756599426
italiener: 0.4306807816028595
vorig: 0.4265478253364563
nachbarland: 0.41570010781288147
polen: 0.4084967076778412


In [161]:
a_m2.get_analogies("putin", "russland", "frankreich")

europa: 0.5466688275337219
macron: 0.5118693113327026
europäisch: 0.46210089325904846
französisch: 0.4606568217277527
nachbar: 0.4326895475387573
blockieren: 0.43004122376441956
schweden: 0.426599383354187
gedanke: 0.409820556640625
währungsunion: 0.39824947714805603
deutsch_französisch: 0.3958217203617096


In [117]:
def compute_biggest_shift(m1_aligned, m2_aligned, common_vocab):
    shifts = []
    for word in common_vocab:
        v1 = m1_aligned.get_word_vector(word)
        v2 = m2_aligned.get_word_vector(word)
        dist = np.linalg.norm(v1-v2)
        shifts.append((word, dist))
    sorted_by_dist = sorted(shifts, key=lambda tup: tup[1], reverse=True)
    print(sorted_by_dist[:100])

In [122]:
compute_biggest_shift(a_m1, a_m2, list(set(afd_vecs["word"]) & set(gruene_vecs["word"])))

[('wiederaufbau', 1.6063725), ('dürr', 1.5907849), ('italien', 1.5822355), ('tendenz', 1.5726488), ('anleihe', 1.5722544), ('familiennachzug', 1.571683), ('ausspielen', 1.5703784), ('verzögerung', 1.5670469), ('frauenhäuser', 1.566237), ('bewältigung', 1.5637577), ('nachfolgend', 1.5620725), ('finanzpolitik', 1.5603054), ('tauchen', 1.5551889), ('kräftig', 1.5545888), ('gefährdet', 1.5541551), ('kanada', 1.5527774), ('serbien', 1.5510054), ('versehen', 1.5508277), ('auseinander', 1.5502356), ('vermittlungsausschuss', 1.5491769), ('fachkräftemangel', 1.5463183), ('staatsbürger', 1.5457841), ('ewig', 1.5441307), ('nachdruck', 1.543524), ('prüfstand', 1.542583), ('ehrenamtlich', 1.5418328), ('bundesfinanzminister', 1.5385687), ('beirat', 1.538407), ('normalerweise', 1.5375516), ('übersehen', 1.5364621), ('präventiv', 1.5353938), ('bundesprogramm', 1.5352918), ('stellenwert', 1.5336287), ('verhandlung', 1.5335323), ('applaus', 1.5312594), ('erzieher', 1.5298506), ('seele', 1.5297663), ('au