# Semantic Change

## Schritte

- Lade Metadaten
- Lade Korpus
- Trainiere Modelle
- Vergleiche für alle Wörter die 100 nächsten Nachbarn im Aufklaerung- und im Kunstepoche-Modell
    -> siehe dafür den Code der Zeta-Heatmaps
- Erstelle eine CSV-Datei mit allen Wörtern und ihrer Übereinstimmung (Anteil gemeinsamer Nachbarn) in den beiden Modellen

## Metadaten und Corpus

In [1]:
import pandas as pd

meta = pd.read_csv("meta_drama.csv", encoding = "utf8")
meta.head()

Unnamed: 0.1,Unnamed: 0,id,title,author,period,type,genre,date,file,source,annotation,tokens_cleaned
0,0,1,Der sterbende Cato,Gottsched,Aufklaerung,Tragedy,drama,1731,gottsched-der-sterbende-cato.xml,https://github.com/dracor-org/gerdracor,,22047
1,1,2,Ein Deutsches Vorspiel,Neuber,Aufklaerung,,drama,1734,neuber-ein-deutsches-vorspiel.xml,https://github.com/dracor-org/gerdracor,,6480
2,2,3,Die Pietisterey im Fischbein-Rocke oder Die Do...,Gottsched,Aufklaerung,Comedy,drama,1736,gottschedin-die-pietisterey-im-fischbein-rocke...,https://github.com/dracor-org/gerdracor,,27691
3,3,4,Die von der Weisheit wider die Unwissenheit be...,Neuber,Aufklaerung,,drama,1736,neuber-die-beschuetzte-schauspielkunst.xml,https://github.com/dracor-org/gerdracor,,7377
4,4,5,Die Verehrung der Vollkommenheit durch die geb...,Neuber,Aufklaerung,,drama,1737,neuber-die-verehrung-der-vollkommenheit.xml,https://github.com/dracor-org/gerdracor,,7091


In [2]:
def load_corpus(path, df):
    """Read the text of every file listed in ``df["file"]``.

    Args:
        path: Prefix that is concatenated directly with each file name, so it
            must already end with a path separator
            (e.g. ``"corpus_drama/"``).
        df: Table-like object whose ``"file"`` entry yields the file names
            (the notebook passes a pandas DataFrame).

    Returns:
        A ``(texts, filenames)`` tuple of two equally long lists;
        ``texts[i]`` is the UTF-8 decoded content of ``filenames[i]``.

    Raises:
        OSError: If one of the listed files cannot be opened.
    """
    texts = []
    filenames = []
    for filename in df["file"]:
        # Plain concatenation is kept on purpose: callers pass a prefix that
        # already ends with "/".
        with open(path + filename, "r", encoding="utf8") as f:
            texts.append(f.read())
        filenames.append(filename)
    return texts, filenames

In [3]:
def load_corpus_drama(path):
    """Read every file found directly inside the directory ``path``.

    The files are decoded as UTF-16. Directory entries are processed in
    sorted name order, because ``os.listdir`` itself returns them in
    arbitrary order and the result should be reproducible between runs.

    Args:
        path: Directory containing the corpus files.

    Returns:
        A ``(texts, filenames)`` tuple of two equally long lists;
        ``texts[i]`` is the content of ``filenames[i]``.

    Raises:
        OSError: If the directory cannot be listed or an entry cannot be
            opened as a file.
    """
    from os import listdir
    from os.path import join

    texts = []
    filenames = []
    for filename in sorted(listdir(path)):
        with open(join(path, filename), "r", encoding="utf16") as f:
            texts.append(f.read())
        filenames.append(filename)
    return texts, filenames

In [4]:
# Load the complete drama corpus, then the two period subsets selected via
# the "period" column of the metadata table.
texts, filenames = load_corpus("corpora/cleaned_normalized/corpus_drama/", meta)
texts_aufklaerung, filenames_aufklaerung = load_corpus(
    "corpora/cleaned_normalized/corpus_drama/",
    meta[meta["period"] == "Aufklaerung"],
)
texts_kunstepoche, filenames_kunstepoche = load_corpus(
    "corpora/cleaned_normalized/corpus_drama/",
    meta[meta["period"] == "Kunstepoche"],
)

In [5]:
# texts, filenames = load_corpus_drama("corpora/cleaned_normalized/corpus_drama/")

In [6]:
# texts_aufklaerung = []
# texts_kunstepoche = []

# for i in range(len(texts)):
#     if "_aufklaerung" in filenames[i]:
#         texts_aufklaerung.append(texts[i])
#     else:
#         texts_kunstepoche.append(texts[i])

In [7]:
len(texts_aufklaerung)

117

In [8]:
texts_aufklaerung[0][:100]

'Johann Christoph Gottsched Cato Trauerspiel Widmung Memoriam Adolf Vogler -- -- -- -- Herr Verfasser'

In [9]:
len(texts_kunstepoche)

206

In [10]:
texts_kunstepoche[0][:100]

'November August Kotzebue Indianer England Lustspiel aufzüg Widmung Freund Hueck Reval Mäster Strusse'

In [11]:
len(texts)

323

In [12]:
def make_sent(texts):
    """Turn raw texts into lists of lowercased, whitespace-separated tokens.

    Args:
        texts: Iterable of strings, one per document.

    Returns:
        A list with one token list per input text; every token is the
        lowercased form of a whitespace-delimited chunk of that text.
    """
    return [
        [token.lower() for token in document.split()]
        for document in texts
    ]

In [13]:
sent_aufklaerung = make_sent(texts_aufklaerung)
sent_kunstepoche = make_sent(texts_kunstepoche)
sent = make_sent(texts)

In [14]:
sent_aufklaerung[0][:10]

['johann',
 'christoph',
 'gottsched',
 'cato',
 'trauerspiel',
 'widmung',
 'memoriam',
 'adolf',
 'vogler',
 '--']

In [15]:
sent_kunstepoche[0][:10]

['november',
 'august',
 'kotzebue',
 'indianer',
 'england',
 'lustspiel',
 'aufzüg',
 'widmung',
 'freund',
 'hueck']

## Basismodell und Hyperparameter

In [16]:
from gensim.models.phrases import Phrases, Phraser
import gensim.models
import multiprocessing

# Hyperparameters handed to gensim.models.FastText inside train_model().
# The short descriptions follow the gensim API documentation.
MIN_COUNT = 10  # passed as min_count: minimum word frequency for the vocabulary
VECTOR_SIZE = 300  # passed as vector_size: dimensionality of the word vectors
WINDOW = 5  # passed as window: context window size
ALPHA = 0.1  # passed as alpha: initial learning rate
NEGATIVE = 20  # passed as negative: number of negative samples

In [17]:
def train_model(words1, words2, epochs1, epochs2):
    """Train a FastText model on ``words1`` and keep training it on ``words2``.

    Both corpora are first passed through their own bigram phrase detector.
    The model is configured from the module-level hyperparameter constants,
    uses a fixed seed of 42 and one worker less than the CPU count.

    Args:
        words1: Tokenized corpus (iterable of token lists) for the first
            training pass; the initial vocabulary is built from it.
        words2: Tokenized corpus for the second pass; the vocabulary is
            extended with ``build_vocab(..., update=True)`` before training.
        epochs1: Number of epochs for the first pass.
        epochs2: Number of epochs for the second pass.

    Returns:
        The trained ``gensim.models.FastText`` model.
    """
    def _with_phrases(corpus):
        # detects multi word expressions
        detected = Phrases(corpus, min_count=30, progress_per=10000)
        return Phraser(detected)[corpus]

    # First pass: build the vocabulary and train on the first corpus.
    base_corpus = _with_phrases(words1)

    # NOTE(review): on a single-core machine this evaluates to 0 workers.
    n_workers = multiprocessing.cpu_count() - 1
    model = gensim.models.FastText(
        min_count=MIN_COUNT,
        vector_size=VECTOR_SIZE,
        window=WINDOW,
        alpha=ALPHA,
        workers=n_workers,
        negative=NEGATIVE,
        seed=42,
    )
    model.build_vocab(corpus_iterable=base_corpus)
    model.train(
        corpus_iterable=base_corpus,
        epochs=epochs1,
        total_examples=model.corpus_count,
    )

    # Second pass: extend the vocabulary and continue training on the
    # second corpus.
    update_corpus = _with_phrases(words2)
    model.build_vocab(corpus_iterable=update_corpus, update=True)
    model.train(
        corpus_iterable=update_corpus,
        epochs=epochs2,
        total_examples=model.corpus_count,
    )

    return model

In [18]:
# Train on the whole corpus first, then continue on the Aufklaerung texts only.
model_aufklaerung = train_model(sent, sent_aufklaerung, 25, 25)

In [19]:
# Train on the whole corpus first, then continue on the Kunstepoche texts only.
model_kunstepoche = train_model(sent, sent_kunstepoche, 25, 25)

In [20]:
model_aufklaerung.wv.most_similar(["königin"])

[('königinn', 0.5166581869125366),
 ('marquis', 0.355306476354599),
 ('könig', 0.3500777781009674),
 ('marqui', 0.3177785575389862),
 ('lerma', 0.31547293066978455),
 ('carlos', 0.30540555715560913),
 ('marquisin', 0.3021179437637329),
 ('königl', 0.3012202978134155),
 ('feenkönigin', 0.30045998096466064),
 ('olivarez', 0.2941695749759674)]

In [21]:
model_kunstepoche.wv.most_similar(["königin"])

[('könig', 0.5415931940078735),
 ('königinn', 0.4740905165672302),
 ('königl', 0.408631831407547),
 ('struensee', 0.3838570713996887),
 ('feenkönigin', 0.3764101564884186),
 ('eur_majestät', 0.37184467911720276),
 ('königlich', 0.3613232970237732),
 ('könige', 0.35222122073173523),
 ('graf_struensee', 0.3476984202861786),
 ('königs', 0.3442992568016052)]

In [22]:
len(model_aufklaerung.wv)

18618

In [23]:
len(model_kunstepoche.wv)

18615

In [24]:
model_aufklaerung.wv.index_to_key[0]

'--'

In [25]:
len(set(model_aufklaerung.wv.index_to_key).intersection(set(model_kunstepoche.wv.index_to_key)))

18614

## Update models

## get wordlist from gensim-output

In [26]:
wordlist_raw = model_kunstepoche.wv.most_similar(["königin"])

def make_wordlist_from_gensimoutput(gensim_output):
    """Extract the first element of every entry in ``gensim_output``.

    Args:
        gensim_output: Iterable of indexable entries; the notebook passes the
            ``(word, similarity)`` pairs returned by ``most_similar``.

    Returns:
        A list holding ``entry[0]`` for each entry, in the original order.
    """
    return [entry[0] for entry in gensim_output]

make_wordlist_from_gensimoutput(wordlist_raw)

['könig',
 'königinn',
 'königl',
 'struensee',
 'feenkönigin',
 'eur_majestät',
 'königlich',
 'könige',
 'graf_struensee',
 'königs']

## Semantic Change

In [27]:
def calculate_difference(list_one, list_two):
    """Return the share of ``list_one`` that also occurs in ``list_two``.

    Despite the name, the value is an overlap score: 1.0 means every distinct
    element of a duplicate-free ``list_one`` is also in ``list_two``, 0.0
    means the lists share nothing. Shared elements are counted as a set,
    while the denominator is the raw length of ``list_one``.

    Args:
        list_one: Reference sequence; its length is the denominator.
        list_two: Sequence compared against ``list_one``.

    Returns:
        Number of distinct shared elements divided by ``len(list_one)``, or
        0.0 when ``list_one`` is empty (instead of raising
        ``ZeroDivisionError``).
    """
    total = len(list_one)
    if total == 0:
        # Nothing to compare, so report no overlap rather than dividing by zero.
        return 0.0

    shared = set(list_one).intersection(list_two)
    return len(shared) / total

In [28]:
calculate_difference(make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar(["königin"], topn=100)),
                     make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar(["königin"], topn=100)))

0.21

In [29]:
make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar(["mensch"]))

['leute',
 'menschlich',
 'menschengeschlecht',
 'menschenglück',
 'mann',
 'jung_mensch',
 'welt',
 'menschenherz',
 'menschenfeind',
 'tugend']

In [30]:
make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar(["mensch"]))

['welt',
 'menschlich',
 'natur',
 'glauben',
 'leute',
 'menschheit',
 'ding',
 'menschlichkeit',
 'leidenschaft',
 'mann']

In [31]:
# Create (or truncate) the result file and write the CSV header. The context
# manager closes the handle even if the write fails.
with open("fasttext_semantic_change_list_drama.csv", "w", encoding="utf8") as f:
    f.write("word,score\n")

In [32]:
# Words that are in the vocabulary of both period models.
wordlist_models = set(model_aufklaerung.wv.index_to_key) & set(model_kunstepoche.wv.index_to_key)

# Of those, keep only the words whose "count_complete" value in the
# frequency table is greater than 4.
word_freq_df = pd.read_csv("code/word_frequency/word_freq_drama.csv")
wordlist = wordlist_models.intersection(
    word_freq_df.loc[word_freq_df["count_complete"] > 4, "word"]
)

In [33]:
len(wordlist)

18224

In [34]:
# Append one "word,score" row per word. The score is the share of the word's
# 100 nearest neighbours in the Aufklaerung model that are also among its 100
# nearest neighbours in the Kunstepoche model.
# The file is opened once for the whole loop instead of once per word; the
# context manager closes it even if a lookup fails midway.
with open("fasttext_semantic_change_list_drama.csv", "a", encoding="utf8") as f:
    for word in wordlist:
        word_list1 = make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar([word], topn=100))
        word_list2 = make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar([word], topn=100))

        score = calculate_difference(word_list1, word_list2)

        f.write(word.lower() + "," + str(score) + "\n")

In [35]:
word = "müssen"

calculate_difference(make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar([word], topn=100)),
                     make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar([word], topn=100)))

0.12