# Semantic Change

## Schritte

- Lade Metadaten
- Lade Korpus
- Trainiere Modelle
- Vergleiche für alle Wörter die 100 nächsten Nachbarn in Aufklärung und Kunstepoche
    -> Siehe dafür den Code der Zeta-Heatmaps
- Erstelle eine CSV-Datei mit allen Wörtern und deren Übereinstimmung in den beiden Modellen

## Metadaten und Corpus

In [1]:
import pandas as pd

# Load the per-drama metadata table; later cells use its "file" and "author_birth_year" columns.
meta = pd.read_csv("meta_drama.csv", encoding = "utf8")
# Preview the first rows to check that the table was parsed as expected.
meta.head()

Unnamed: 0.1,Unnamed: 0,id,title,author,author_birth_year,period,type,genre,date,file,source,annotation,tokens_cleaned
0,0,1,Der sterbende Cato,Gottsched,1700,Aufklärung,Tragedy,drama,1731,gottsched-der-sterbende-cato.xml,https://github.com/dracor-org/gerdracor,,22047
1,1,2,Ein Deutsches Vorspiel,Neuber,1697,Aufklärung,,drama,1734,neuber-ein-deutsches-vorspiel.xml,https://github.com/dracor-org/gerdracor,,6480
2,2,3,Die Pietisterey im Fischbein-Rocke oder Die Do...,Gottsched,1700,Aufklärung,Comedy,drama,1736,gottschedin-die-pietisterey-im-fischbein-rocke...,https://github.com/dracor-org/gerdracor,,27691
3,3,4,Die von der Weisheit wider die Unwissenheit be...,Neuber,1697,Aufklärung,,drama,1736,neuber-die-beschuetzte-schauspielkunst.xml,https://github.com/dracor-org/gerdracor,,7377
4,4,5,Die Verehrung der Vollkommenheit durch die geb...,Neuber,1697,Aufklärung,,drama,1737,neuber-die-verehrung-der-vollkommenheit.xml,https://github.com/dracor-org/gerdracor,,7091


In [2]:
def load_corpus(path, df):
    """Read every text file listed in ``df["file"]`` from the directory ``path``.

    Parameters
    ----------
    path : str
        Directory that contains the corpus files. A trailing path separator
        is accepted but not required.
    df : pandas.DataFrame
        Table with a ``"file"`` column holding one file name per row.

    Returns
    -------
    tuple[list[str], list[str]]
        The file contents and the matching file names, both in the row order
        of ``df``.

    Raises
    ------
    OSError
        If one of the listed files cannot be opened.
    """
    from os.path import join

    texts = []
    filenames = []
    for filename in df["file"]:
        # join() adds the separator only when it is missing, so both
        # "corpus/" and "corpus" work as ``path``.
        with open(join(path, filename), 'r', encoding="utf8") as f:
            texts.append(f.read())

        filenames.append(filename)
    return texts, filenames

In [3]:
def load_corpus_nometa(path):
    """Read every file found directly in the directory ``path``.

    Parameters
    ----------
    path : str
        Directory that contains the corpus files.

    Returns
    -------
    tuple[list[str], list[str]]
        The file contents and the matching file names, both sorted by file
        name so that repeated runs return the same order.

    Raises
    ------
    OSError
        If the directory cannot be listed or an entry cannot be opened as a
        file.
    """
    from os import listdir
    from os.path import join

    texts = []
    filenames = []
    # listdir() returns entries in arbitrary order; sorting keeps the result
    # reproducible across runs and machines.
    for filename in sorted(listdir(path)):
        with open(join(path, filename), 'r', encoding = "utf8") as f:
            texts.append(f.read())

        filenames.append(filename)
    return texts, filenames

In [4]:
# Shared corpus location and the author-birth-year masks that split the two periods.
corpus_dir = "corpora/cleaned_normalized/corpus_drama/"
born_before_1770 = meta["author_birth_year"] < 1770
born_from_1770 = meta["author_birth_year"] >= 1770

# Sub-corpus of authors born before 1770, sub-corpus of authors born 1770 or later,
# and finally the complete corpus.
texts_aufklaerung, filenames_aufklaerung = load_corpus(corpus_dir, meta.loc[born_before_1770])
texts_kunstepoche, filenames_kunstepoche = load_corpus(corpus_dir, meta.loc[born_from_1770])
texts, filenames = load_corpus(corpus_dir, meta)

In [5]:
# Disabled alternative: load every file in the corpus directory without the metadata table.
# texts, filenames = load_corpus_nometa("corpora/cleaned_normalized/corpus_drama/")

In [6]:
# Disabled alternative: split the corpus by an "_aufklaerung" marker in the file name
# instead of by the author's birth year from the metadata table.
# texts_aufklaerung = []
# texts_kunstepoche = []

# for i in range(len(texts)):
#     if "_aufklaerung" in filenames[i]:
#         texts_aufklaerung.append(texts[i])
#     else:
#         texts_kunstepoche.append(texts[i])

In [7]:
# Number of texts by authors born before 1770.
len(texts_aufklaerung)

166

In [8]:
# First 100 characters of the first text in the pre-1770 sub-corpus.
texts_aufklaerung[0][:100]

'Trauerspiel Widmung Herr Verfasser Vorrede Ausgabe unterstehen Tragödie drucken lassen Art Gedichten'

In [9]:
# Number of texts by authors born in 1770 or later.
len(texts_kunstepoche)

157

In [10]:
# First 100 characters of the first text in the 1770-and-later sub-corpus.
texts_kunstepoche[0][:100]

'Rauchfangkehrer unentbehrlich Verräter Herrschafte musikalisch Lustspiel aufzüg Person Frau jung Wit'

In [11]:
# Number of texts in the complete corpus (all rows of the metadata table).
len(texts)

323

In [12]:
def make_sent(texts):
    """Turn each text into one list of lower-cased, whitespace-separated tokens.

    Parameters
    ----------
    texts : iterable of str
        The raw texts.

    Returns
    -------
    list[list[str]]
        One token list per input text, in input order. Each text becomes a
        single token list; it is not split into sentences.
    """
    return [[token.lower() for token in document.split()] for document in texts]

In [13]:
# Tokenise both sub-corpora and the complete corpus in one pass over the three inputs.
sent_aufklaerung, sent_kunstepoche, sent = (
    make_sent(corpus) for corpus in (texts_aufklaerung, texts_kunstepoche, texts)
)

In [14]:
# First ten tokens of the first tokenised text in the pre-1770 sub-corpus.
sent_aufklaerung[0][:10]

['trauerspiel',
 'widmung',
 'herr',
 'verfasser',
 'vorrede',
 'ausgabe',
 'unterstehen',
 'tragödie',
 'drucken',
 'lassen']

In [15]:
# First ten tokens of the first tokenised text in the 1770-and-later sub-corpus.
sent_kunstepoche[0][:10]

['rauchfangkehrer',
 'unentbehrlich',
 'verräter',
 'herrschafte',
 'musikalisch',
 'lustspiel',
 'aufzüg',
 'person',
 'frau',
 'jung']

## Basismodell und Hyperparameter

In [16]:
from gensim.models.phrases import Phrases, Phraser
import gensim.models
import multiprocessing

# Hyperparameters passed to gensim's FastText constructor in train_model().
MIN_COUNT = 10  # min_count: words with a lower total frequency are ignored
VECTOR_SIZE = 200  # vector_size: dimensionality of the word vectors
WINDOW = 15  # window: maximum distance between the current and the predicted word
ALPHA = 0.1  # alpha: initial learning rate. NOTE(review): gensim's default is 0.025 - confirm this higher value is intended
NEGATIVE = 5  # negative: number of noise words drawn for negative sampling

In [17]:
def _prepare_sentences(words, phrase_min_count=30, max_sentence_len=10000):
    """Merge frequent bigrams in ``words`` and return training-ready sentences.

    The phrase-transformed corpus is materialised once as a list, so the
    bigram transformation is not recomputed on every training epoch. Token
    lists longer than ``max_sentence_len`` are split into consecutive chunks,
    because gensim's optimised training routines only use the first 10000
    tokens of a sentence and would otherwise ignore the rest of a long text.
    """
    phrases = Phrases(words, min_count=phrase_min_count, progress_per=10000) # detects multi word expressions
    bigram = Phraser(phrases)

    sentences = []
    for tokens in bigram[words]:
        for start in range(0, len(tokens), max_sentence_len):
            sentences.append(tokens[start:start + max_sentence_len])
    return sentences


def train_model(words1, words2, epochs1, epochs2):
    """Train a FastText model on ``words1`` and continue training on ``words2``.

    Parameters
    ----------
    words1 : list[list[str]]
        Tokenised texts for the first training stage.
    words2 : list[list[str]]
        Tokenised texts for the second training stage; their vocabulary is
        added to the model before training continues.
    epochs1 : int
        Number of epochs over ``words1``.
    epochs2 : int
        Number of epochs over ``words2``.

    Returns
    -------
    gensim.models.FastText
        The model after both training stages.

    Notes
    -----
    Bigram detection is run separately on each input. The model is created
    with ``seed=42``. NOTE(review): per the gensim documentation a seed alone
    does not make multi-worker training fully reproducible - confirm whether
    exact reproducibility is needed.
    """
    sentences1 = _prepare_sentences(words1)

    model = gensim.models.FastText(min_count=MIN_COUNT,
                                   vector_size=VECTOR_SIZE,
                                   window = WINDOW,
                                   alpha=ALPHA,
                                   # keep one core free, but never drop to zero workers
                                   workers = max(1, multiprocessing.cpu_count()-1),
                                   negative = NEGATIVE,
                                   seed = 42)

    # Stage 1: build the vocabulary from the first corpus and train on it.
    model.build_vocab(corpus_iterable=sentences1)

    model.train(corpus_iterable=sentences1,
                epochs = epochs1,
                total_examples=model.corpus_count)

    # Stage 2: extend the vocabulary with the second corpus and keep training.
    sentences2 = _prepare_sentences(words2)

    model.build_vocab(corpus_iterable=sentences2,
                      update=True)

    model.train(corpus_iterable=sentences2,
                epochs = epochs2,
                total_examples=model.corpus_count)

    return model

In [18]:
# Train 30 epochs on the complete corpus, then 30 more epochs on the pre-1770 sub-corpus.
model_aufklaerung = train_model(sent, sent_aufklaerung, 30, 30)

KeyboardInterrupt: 

In [None]:
# Train 30 epochs on the complete corpus, then 30 more epochs on the 1770-and-later sub-corpus.
model_kunstepoche = train_model(sent, sent_kunstepoche, 30, 30)

In [None]:
# Sanity check: most similar words to "königin" in the pre-1770 model.
model_aufklaerung.wv.most_similar(["königin"])

In [None]:
# Sanity check: most similar words to "königin" in the 1770-and-later model.
model_kunstepoche.wv.most_similar(["königin"])

## get wordlist from gensim-output

In [None]:
def make_wordlist_from_gensimoutput(gensim_output):
    """Return the first element of every entry in ``gensim_output``.

    Parameters
    ----------
    gensim_output : iterable
        Indexable entries; the notebook passes the result of
        ``most_similar``, so element 0 of each entry is the word.

    Returns
    -------
    list
        The first elements, in input order.
    """
    return [entry[0] for entry in gensim_output]

## Semantic Change

In [None]:
def calculate_difference(list_one, list_two):
    """Return the share of ``list_one`` that also occurs in ``list_two``.

    Despite its name, the function measures overlap: the number of distinct
    elements common to both lists divided by ``len(list_one)``. A value of
    1.0 means every element is shared, 0.0 means none is.

    Parameters
    ----------
    list_one : list
        Reference list; its length is the denominator.
    list_two : list
        List to compare against.

    Returns
    -------
    float
        Overlap share. Returns 0.0 when ``list_one`` is empty instead of
        raising ``ZeroDivisionError``.
    """
    length_list_one = len(list_one)
    if length_list_one == 0:
        # Nothing to compare: report no overlap rather than dividing by zero.
        return 0.0

    shared = set(list_one).intersection(list_two)

    return len(shared) / length_list_one

In [None]:
# Share of the 100 nearest neighbours of "königin" that both period models have in common.
calculate_difference(make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar(["königin"], topn=100)),
                     make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar(["königin"], topn=100)))

In [None]:
# Most similar words to "mensch" in the pre-1770 model (words only, without scores).
make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar(["mensch"]))

In [None]:
# Most similar words to "mensch" in the 1770-and-later model (words only, without scores).
make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar(["mensch"]))

# Evaluation

In [None]:
# Word-frequency table; the leftover index column from an earlier export is dropped.
df = pd.read_csv("results/word_freq_drama.csv", encoding = "utf8").drop(columns=["Unnamed: 0"])

# Keep only words whose count in the complete corpus exceeds 4.
is_frequent = df["count_complete"] > 4
wordlist_freq = df.loc[is_frequent, "word"].tolist()
len(wordlist_freq)

In [None]:
# Look at the vocabulary entry stored at index 1 of the pre-1770 model.
model_aufklaerung.wv.index_to_key[1]

In [None]:
# Copy the complete vocabulary of the pre-1770 model into a plain list.
wordlist_aufklaerung = list(model_aufklaerung.wv.index_to_key)
len(wordlist_aufklaerung)

In [None]:
# Copy the complete vocabulary of the 1770-and-later model into a plain list.
wordlist_kunstepoche = list(model_kunstepoche.wv.index_to_key)
len(wordlist_kunstepoche)

In [None]:
# Words that are in both model vocabularies and in the frequency-filtered word list.
# Sorted into a list so that the order of the scores and of the exported rows is the
# same on every run; a set's iteration order can differ between interpreter sessions.
wordlist = sorted(set(wordlist_kunstepoche).intersection(set(wordlist_aufklaerung)).intersection(set(wordlist_freq)))

In [None]:
"rüstig" in wordlist

In [None]:
# For every shared word, the share of its 100 nearest neighbours that both
# period models agree on; the scores follow the iteration order of wordlist.
scores = [
    calculate_difference(
        make_wordlist_from_gensimoutput(model_aufklaerung.wv.most_similar([str(word)], topn=100)),
        make_wordlist_from_gensimoutput(model_kunstepoche.wv.most_similar([str(word)], topn=100)),
    )
    for word in wordlist
]

In [None]:
# One row per shared word with its neighbour-overlap score.
finalDf = pd.DataFrame({"word": list(wordlist), "semantic_score": scores})

# Attach the columns of the frequency table to every word.
newDf = finalDf.merge(df, on="word")
newDf.head()

In [None]:
import plotly.express as px

# Box plot of the distribution of the overlap scores across all shared words.
fig = px.box(finalDf, y = "semantic_score")
fig.show()

In [None]:
# Hand-picked evaluation words. Judging by the variable names, the first list holds words
# presumed to have changed in meaning and the second list words presumed stable - TODO confirm
# the selection criteria and their source.
evaluation_list_change = ["wunderlich", "aufrüsten", "korrumpieren", "unartig",  "merkwürdig", "gemüt", "bedenken", "fräulein", "frauenzimmer", "brav"]
evaluation_list_nochange = ["korruption", "ehrlich", "witzig", "ehre", "gemütlich", "einsam", "geschichte", "aufdringlich", "zudringlich", "bequem", "genie", "blöd"]

In [None]:
# Size of the "change" evaluation list.
len(evaluation_list_change)

In [None]:
# Size of the "no change" evaluation list.
len(evaluation_list_nochange)

In [None]:
# Summary statistics of the overlap scores over all shared words, as a baseline for the evaluation lists.
finalDf.describe()

In [None]:
# "Change" evaluation words that occur at least 10 times in each period.
newDf.loc[
    newDf["word"].isin(evaluation_list_change)
    & (newDf["count_aufklaerung"] >= 10)
    & (newDf["count_kunstepoche"] >= 10)
]

In [None]:
# Median of the numeric columns for the "change" evaluation words that occur at least
# 10 times in each period. numeric_only=True skips the string column "word", which
# would otherwise make median() raise a TypeError on pandas >= 2.0.
newDf.loc[
    newDf["word"].isin(evaluation_list_change)
    & (newDf["count_aufklaerung"] >= 10)
    & (newDf["count_kunstepoche"] >= 10)
].median(numeric_only=True)

In [None]:
# "No change" evaluation words that occur at least 10 times in each period.
newDf.loc[
    newDf["word"].isin(evaluation_list_nochange)
    & (newDf["count_aufklaerung"] >= 10)
    & (newDf["count_kunstepoche"] >= 10)
]

In [None]:
# Median of the numeric columns for the "no change" evaluation words that occur at
# least 10 times in each period. numeric_only=True skips the string column "word",
# which would otherwise make median() raise a TypeError on pandas >= 2.0.
newDf.loc[
    newDf["word"].isin(evaluation_list_nochange)
    & (newDf["count_aufklaerung"] >= 10)
    & (newDf["count_kunstepoche"] >= 10)
].median(numeric_only=True)

In [None]:
# Export the scores together with the frequency columns.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by default; it
# comes back as "Unnamed: 0" when the file is read again - confirm whether that is wanted.
newDf.to_csv("semantic_change_freq_results.csv", encoding = "utf8")