In [3]:
import spacy
from collections import Counter
import tomotopy as tp
import os
from tqdm import tqdm
import pandas
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

In [4]:
# Manual step: remove pages 240 to the end of the Hauptmann volume from its "page" folder before running the cells below.

## Merge paragraphs

In [5]:
import xml.etree.ElementTree as ET

In [6]:
# Prefix map handed to ElementTree's find()/findall() so that "pcgts:" resolves
# to the PAGE content schema namespace.
ns = {"pcgts": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}

In [7]:
# Folder names of the sources to process; each one is used to build a
# ../data/<name>/<name>/page/ path further down.
folders = [
    "CAP1905", "HAU1853", "HOS1879", "KRA1852",
    "KUN1863", "NAU1858", "OET1866", "RIE1905",
    "THU1877", "WEI1860", "WEI1861",
]

In [8]:
# Accumulator for the paragraph strings extracted from the PAGE XML files.
# Re-run this cell before re-running the extraction loop, otherwise the loop
# appends the same paragraphs a second time.
paragraphs = list()

In [9]:
# Walk every PAGE XML file of every source folder and collect its paragraphs.
# Regions tagged "paragraph-continued" are buffered and prepended (joined by
# single spaces) to the next region tagged "paragraph" on the same page.
for folder in tqdm(folders):
    path = f'../data/{folder}/{folder}/page/'

    # os.listdir() gives no ordering guarantee; sorting keeps the paragraph
    # order (and therefore every downstream model) reproducible.
    for file in sorted(os.listdir(path)):

        tree = ET.parse(path+file)
        root = tree.getroot()

        # Continued fragments waiting for the paragraph that completes them.
        # NOTE(review): the buffer is reset for each file, so fragments still
        # pending at the end of a page are discarded (same as before). If
        # paragraphs can continue across pages they would have to be carried
        # over -- confirm against the annotation scheme.
        provisory = []

        for region in root.findall(".//pcgts:TextRegion", ns):
            # A region without a "custom" attribute has no type tag; skip it
            # instead of raising KeyError.
            custom = region.attrib.get('custom', '')
            is_continued = "{type:paragraph-continued;}" in custom
            if not is_continued and "{type:paragraph;}" not in custom:
                continue

            # The last TextEquiv found below the region is taken as its text
            # (presumably the region-level transcription -- TODO confirm).
            text_equivs = region.findall(".//pcgts:TextEquiv", ns)
            unicode_node = text_equivs[-1].find(".//pcgts:Unicode", ns) if text_equivs else None
            # ElementTree yields None for an empty element; normalise to "".
            paragraph = (unicode_node.text or "") if unicode_node is not None else ""

            if is_continued:
                if paragraph:
                    provisory.append(paragraph)
                continue

            # Complete paragraph: glue the buffered fragments in front of it.
            parts = provisory + ([paragraph] if paragraph else [])
            provisory = []
            if parts:
                paragraphs.append(" ".join(parts))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:15<00:00,  1.37s/it]


In [10]:
# Number of paragraphs collected by the extraction loop (1814 in the recorded run).
len(paragraphs)

1814

## Load data

In [11]:
# Load the German spaCy pipeline used for tokenisation, stop-word flags and lemmas below.
nlp = spacy.load('de_core_news_lg')

In [12]:
# Extra tokens to be flagged as stop words on the spaCy vocabulary:
# abbreviations, symbols and placeholder tokens first ...
custom_stopwords = [
    'B.', '$', 'Fig', 'z.', 'MATH', '=', "S.", "Seite", "2c", "GRAPHIC",
    "pag", "NB", "C.", "s.", "u.", "v.", "k", "l", "i", "R.", "H.", "de",
    "F.", "d.", "h.", "c.", "J.", "a.", "M.",
]
# ... followed by the numbers 0-999 in their string form.
custom_stopwords.extend(str(number) for number in range(1000))

In [13]:
# Flag every custom entry as a stop word on its vocabulary lexeme, so the
# `token.is_stop` filters used when building the models drop it.
for stopword in custom_stopwords:
    nlp.vocab[stopword].is_stop = True

# Words that must stay in the models: clear their stop-word flag explicitly.
for kept_word in ("a",):
    nlp.vocab[kept_word].is_stop = False

In [19]:
# Handle on the pipeline's "lemmatizer" component; not referenced again in the cells shown here.
lemmatizer = nlp.get_pipe("lemmatizer")

In [20]:
# List the plain-text sources to parse.
# os.listdir() returns entries in arbitrary order and also reports anything
# else living in the folder (e.g. checkpoint directories), so keep only the
# .txt files and sort them: the document order of the models built later
# follows this list.
file_list = sorted(
    entry for entry in os.listdir('../data/all_txt/') if entry.endswith('.txt')
)

print(file_list)

['CAP1905.txt', 'HAU1853.txt', 'HOS1879.txt', 'KRA1852.txt', 'KUN1863.txt', 'NAU1858.txt', 'OET1866.txt', 'RIE1905.txt', 'THU1877.txt', 'WEI1860.txt', 'WEI1861.txt']


In [21]:
# Run the spaCy pipeline over every book and keep the resulting Doc,
# keyed by the file name up to its first dot (e.g. "CAP1905").
texts = {}
for file in tqdm(file_list):
    with open(f'../data/all_txt/{file}', 'r', encoding='utf-8') as f:
        text = f.read()

    # Drop every "¬" that is directly followed by a line break, together with
    # that line break, before parsing.
    text = text.replace('¬\n', '')
    doc = nlp(text)

    name = file.split('.')[0]
    texts[name] = doc

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [04:12<00:00, 22.92s/it]


## Book-level topic modeling

In [36]:
# Book-level LDA model with 5 topics.
# A fixed seed makes repeated runs start from the same random state
# (tomotopy notes that results can still vary with the number of workers).
mdl_b = tp.LDAModel(k=5, seed=42)

In [37]:
# One LDA document per book: the surface form of every token that is neither
# a stop word, nor punctuation, nor whitespace.
for name, txt in texts.items():
    words = []
    for token in txt:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        words.append(token.text)
    mdl_b.add_doc(words)
    


In [38]:
# Fit the book-level model; 2000 is passed as train()'s first argument
# (the iteration count in tomotopy's documented signature).
mdl_b.train(2000)

In [39]:
# Persist the book-level model. NOTE(review): per the tomotopy docs, full=False
# stores only the topic parameters (enough for inference, not for further
# training) -- confirm this is intended.
mdl_b.save('bookmodel.bin', full=False)

## 100-word-level topic modeling

In [40]:
# LDA model with 5 topics for the 100-word chunks.
# A fixed seed makes repeated runs start from the same random state
# (tomotopy notes that results can still vary with the number of workers).
mdl_100 = tp.LDAModel(k=5, seed=42)

In [41]:
# Cut each book's filtered token stream into consecutive chunks of 100 words
# (the final chunk of a book may be shorter) and add each chunk as a document.
for txt in texts.values():
    words = []
    for token in txt:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        words.append(token.text)

    for start in range(0, len(words), 100):
        chunk = words[start:start + 100]
        mdl_100.add_doc(chunk)

In [42]:
# Fit the 100-word-chunk model; 2000 is passed as train()'s first argument
# (the iteration count in tomotopy's documented signature).
mdl_100.train(2000)

In [43]:
# Persist the 100-word-chunk model. NOTE(review): per the tomotopy docs,
# full=False stores only the topic parameters -- confirm this is intended.
mdl_100.save("100model.bin", full=False)

## Paragraph-level topic modeling

In [22]:
# Parse every paragraph with spaCy after removing each "¬" that is directly
# followed by a line break (together with that line break).
clean_pars = [
    nlp(par.replace('¬\n', ''))
    for par in tqdm(paragraphs)
]

100%|██████████████████████████████████████████████████████████████████████████████| 1814/1814 [01:10<00:00, 25.85it/s]


In [23]:
# Paragraph-level LDA model with 5 topics.
# A fixed seed makes repeated runs start from the same random state
# (tomotopy notes that results can still vary with the number of workers).
mdl_p = tp.LDAModel(k=5, seed=42)

In [24]:
# One LDA document per paragraph. Unlike the book-level and 100-word cells,
# this one feeds lemmas (token.lemma_) rather than surface forms.
for par in clean_pars:
    words = []
    for token in par:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        words.append(token.lemma_)
    mdl_p.add_doc(words)

In [25]:
# Fit the paragraph-level model; 2000 is passed as train()'s first argument
# (the iteration count in tomotopy's documented signature).
mdl_p.train(2000)

In [26]:
# Persist the paragraph-level model. NOTE(review): per the tomotopy docs,
# full=False stores only the topic parameters -- confirm this is intended.
mdl_p.save("parmodel.bin", full=False)

In [36]:
# Quick check of the lemmatizer on an inflected adjective. In the recorded
# output 'musikalische' comes back unchanged instead of as 'musikalisch'.
text = "musikalisch Herr und musikalische Frau"
doc = nlp(text)
lemmas = [token.lemma_ for token in doc]
print(lemmas)

['musikalisch', 'Herr', 'und', 'musikalische', 'Frau']


  and should_run_async(code)


## Chapter-level topic modeling

In [11]:
# Chapter-level LDA model with 5 topics (no documents are added to it in the
# cells shown here).
# A fixed seed makes repeated runs start from the same random state
# (tomotopy notes that results can still vary with the number of workers).
mdl_c = tp.LDAModel(k=5, seed=42)

In [None]:
# TODO: use nltk for stemming

### pyLDAvis

In [30]:
import warnings
import pyLDAvis

In [32]:
# Turn on pyLDAvis' notebook display mode; warnings raised while doing so are
# suppressed for the duration of the `with` block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Only the setup happens here; the visualisations are built and saved in the cells below.
    
    pyLDAvis.enable_notebook()

  and should_run_async(code)


In [147]:
# Build, save and display the pyLDAvis view of the book-level model.
mdl = mdl_b

# One row per topic: that topic's distribution over the vocabulary.
topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
# One row per document: that document's distribution over the topics.
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
# pyLDAvis.prepare() validates that every row sums to 1; renormalise so that
# floating-point rounding cannot trip that check (as in tomotopy's own
# pyLDAvis example).
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    sort_topics=False,  # keep the model's own topic order
    start_index=1,      # label topics from 1 instead of 0
)
pyLDAvis.save_html(prepared_data, "bookldavis.html")
pyLDAvis.display(prepared_data)

  and should_run_async(code)


In [148]:
# Build, save and display the pyLDAvis view of the 100-word-chunk model.
mdl = mdl_100

# One row per topic: that topic's distribution over the vocabulary.
topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
# One row per document: that document's distribution over the topics.
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
# pyLDAvis.prepare() validates that every row sums to 1; renormalise so that
# floating-point rounding cannot trip that check (as in tomotopy's own
# pyLDAvis example).
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    sort_topics=False,  # keep the model's own topic order
    start_index=1,      # label topics from 1 instead of 0
)
pyLDAvis.save_html(prepared_data, "100ldavis.html")
pyLDAvis.display(prepared_data)

  and should_run_async(code)


In [33]:
import pyLDAvis
import numpy as np 
import warnings

# Build, save and display the pyLDAvis view of the paragraph-level model.
mdl = mdl_p

# One row per topic: that topic's distribution over the vocabulary.
topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
# One row per document: that document's distribution over the topics.
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
# pyLDAvis.prepare() validates that every row sums to 1; renormalise so that
# floating-point rounding cannot trip that check (as in tomotopy's own
# pyLDAvis example).
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    sort_topics=False,  # keep the model's own topic order
    start_index=1,      # label topics from 1 instead of 0
)
pyLDAvis.save_html(prepared_data, "paragraphldavis.html")
pyLDAvis.display(prepared_data)

  and should_run_async(code)
