In [2]:
import spacy
from collections import Counter
import tomotopy as tp
import os
from tqdm import tqdm
import pandas
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

In [3]:
# Manual step (not done by this notebook): remove pages 240 to the end of the Hauptmann volume from its "page" folder before running the cells below.

## Merge paragraphs

In [4]:
import xml.etree.ElementTree as ET

In [5]:
# Prefix -> namespace URI map passed to every ElementTree find()/findall()
# call, so XPath expressions can use the short "pcgts:" prefix.
ns = dict(
    pcgts="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15",
)

In [6]:
# Identifiers of the source folders to process; each one is looked up
# under ../data/<id>/<id>/page/ by the parsing loop.
folders = """
    CAP1905 HAU1853 HOS1879 KRA1852 KUN1863 NAU1858
    OET1866 RIE1905 THU1877 WEI1860 WEI1861
""".split()

In [7]:
# Accumulator for the text of every merged paragraph, across all folders;
# it is filled by the page-parsing loop.
paragraphs = []

In [8]:
# Start from an empty list so that re-running this cell does not append
# every paragraph a second time.
paragraphs = []


def _region_text(region):
    """Return the text of the last TextEquiv/Unicode found under `region`.

    Returns an empty string when the region has no TextEquiv, no Unicode
    element, or an empty Unicode element, instead of raising or yielding None.
    """
    texts = region.findall(".//pcgts:TextEquiv", ns)
    if not texts:
        return ""
    unicode_el = texts[-1].find(".//pcgts:Unicode", ns)
    if unicode_el is None or unicode_el.text is None:
        return ""
    return unicode_el.text


for folder in tqdm(folders):
    path = f'../data/{folder}/{folder}/page/'

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of `paragraphs` reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        root = ET.parse(os.path.join(path, file)).getroot()

        # Texts of "paragraph-continued" regions waiting to be joined with
        # the next "paragraph" region of the same page.
        provisory = []

        for region in root.findall(".//pcgts:TextRegion", ns):
            # Regions without a `custom` attribute carry no structure tag.
            custom = region.attrib.get('custom', '')

            if "{type:paragraph-continued;}" in custom:
                text = _region_text(region)
                if text:
                    provisory.append(text)
            elif "{type:paragraph;}" in custom:
                text = _region_text(region)
                pieces = provisory + ([text] if text else [])
                provisory = []
                if pieces:
                    # Pending continued pieces first, then this paragraph,
                    # separated by single spaces.
                    paragraphs.append(" ".join(pieces))

        # A page may end on "paragraph-continued" regions with no closing
        # "paragraph" region; keep that text as its own paragraph rather
        # than silently discarding it.
        if provisory:
            paragraphs.append(" ".join(provisory))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  6.54it/s]


In [9]:
# Sanity check: number of merged paragraphs collected from all folders.
len(paragraphs)

1814

## Load data

In [10]:
# Load the German spaCy pipeline used for tokenisation, lemmatisation and
# stop-word flags in the cells below.
# Alternative pipeline, kept for reference:
#nlp = spacy.load('de_core_news_lg')
nlp = spacy.load('de_dep_news_trf')

In [25]:
# Extra tokens that get flagged as stop words on the spaCy vocabulary.
custom_stopwords = [
    # abbreviations, symbols and placeholder tokens
    "B.", "$", "Fig", "z.", "MATH", "=", "S.", "Seite", "2c", "GRAPHIC",
    "pag", "NB", "C.", "s.", "u.", "v.", "k", "l", "i", "i.",
    "R.", "H.", "de", "F.", "d.", "h.", "c.", "J.", "a.", "M.",
    ",",
    # frequent words that carry no topical information
    "enthalten", "vgl.", "blos", "blosse", "blossen", "nen", "nämlich",
    "giebt", "lassen", "bringen", "bringende", "geben", "finden",
    "namentlich", "meist", "meiste", "meisten", "meistens", "stellen",
    "halten", "beruhen", "nunmehr", "mithin", "sogar", "sofort", "gar",
    "stets", "et", "la",
]
# Every integer from 0 to 999, as a string.
custom_stopwords.extend(map(str, range(1000)))

  and should_run_async(code)


In [26]:
# Mark every custom stop word on its vocabulary entry, so that
# `token.is_stop` is True for tokens with exactly that text.
for stopword in custom_stopwords:
    nlp.vocab[stopword].is_stop = True

# Words that must stay in the corpus: clear their stop flag explicitly.
for kept_word in ("a",):
    nlp.vocab[kept_word].is_stop = False

  and should_run_async(code)


## Paragraph-level topic modeling

In [28]:
# Clean paragraphs and run them through the spaCy pipeline.
# The "¬" sign marks a word broken at a line end: first drop it together
# with the following newline (re-joining the word), then drop any "¬"
# that is left over.
dehyphenated_pars = [
    par.replace('¬\n', '').replace('¬', '')
    for par in paragraphs
]

# nlp.pipe() processes the texts as a stream in batches, which spaCy
# recommends over calling nlp() once per text; the Docs come back in input
# order. `clean_pars` therefore holds one spaCy Doc per paragraph.
clean_pars = list(tqdm(nlp.pipe(dehyphenated_pars), total=len(dehyphenated_pars)))

  and should_run_async(code)
100%|██████████████████████████████████████████████████████████████████████████████| 1814/1814 [24:20<00:00,  1.24it/s]


In [29]:
# Paragraph-level LDA model with 5 topics.
# A fixed seed makes the random initialisation repeatable between runs.
# NOTE(review): tomotopy results may also depend on the number of workers
# used by train() -- confirm if bit-identical reruns are required.
mdl_p = tp.LDAModel(k=5, seed=42)

  and should_run_async(code)


In [30]:
# Add every paragraph to the model as a list of lemmas.
# NOTE: running this cell twice adds every document twice; re-create
# `mdl_p` first when re-running.

# `token.is_stop` only tests the surface form, but several custom stop
# words are lemmas (e.g. "geben", "finden"). Inflected forms would pass
# that test and re-enter the corpus as the very lemma that was meant to be
# excluded, so the lemma is checked against the custom list as well.
custom_stopword_set = set(custom_stopwords)

for par in clean_pars:
    words = [
        token.lemma_
        for token in par
        if not (token.is_stop or token.is_punct or token.is_space)
        and token.lemma_ not in custom_stopword_set
    ]
    mdl_p.add_doc(words)

  and should_run_async(code)


In [31]:
# Train the paragraph-level model; 2000 is passed as train()'s first
# positional argument (the iteration count per the tomotopy docs).
mdl_p.train(2000)

  and should_run_async(code)


In [32]:
# Write the trained paragraph-level model to "parmodeltrf4.bin".
# NOTE(review): full=False presumably saves a reduced model (without the
# training documents/state) -- confirm against the tomotopy docs before
# relying on the file for further training.
mdl_p.save("parmodeltrf4.bin", full=False)

  and should_run_async(code)


### pyLDAvis

In [33]:
import pyLDAvis
import numpy as np 
import warnings

# Switch pyLDAvis to notebook rendering, hiding any warnings it emits
# while doing so.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pyLDAvis.enable_notebook()

# Model to visualise: the paragraph-level LDA model.
mdl = mdl_p

# Pull the inputs for pyLDAvis out of the trained model.
topic_term_dists = np.stack(
    [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
)
doc_topic_dists = np.stack([document.get_topic_dist() for document in mdl.docs])
doc_lengths = np.array([len(document.words) for document in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

# sort_topics=False keeps the model's own topic order; start_index=1
# numbers the topics from 1 in the visualisation.
prepared_data = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
    sort_topics=False,
    start_index=1,
)

# Export a standalone HTML page and show the visualisation inline.
pyLDAvis.save_html(prepared_data, "paragraphtrfldavis.html")
pyLDAvis.display(prepared_data)

  and should_run_async(code)


In [35]:
# Generate models for 2-11 topics and export one pyLDAvis page per model
# ("paragraphtrfldavis2.html" ... "paragraphtrfldavis11.html").
# NOTE: `mdl_p` and `mdl` are rebound on every iteration, so after this
# cell they refer to the 11-topic model, not the 5-topic one trained above.

# `token.is_stop` only tests the surface form, but several custom stop
# words are lemmas (e.g. "geben", "finden"); checking the lemma too keeps
# inflected forms from re-entering the corpus as those lemmas.
custom_stopword_set = set(custom_stopwords)

# The token filtering does not depend on the number of topics, so it is
# done once here instead of once per model.
par_words = [
    [
        token.lemma_
        for token in par
        if not (token.is_stop or token.is_punct or token.is_space)
        and token.lemma_ not in custom_stopword_set
    ]
    for par in clean_pars
]

for n_topics in range(2, 12):
    # Fixed seed so that each model's random initialisation is repeatable.
    mdl_p = tp.LDAModel(k=n_topics, seed=42)

    for words in par_words:
        mdl_p.add_doc(words)

    mdl_p.train(2000)

    mdl = mdl_p

    # Pull the inputs for pyLDAvis out of the trained model.
    topic_term_dists = np.stack([mdl.get_topic_word_dist(t) for t in range(mdl.k)])
    doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
    doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
    vocab = list(mdl.used_vocabs)
    term_frequency = mdl.used_vocab_freq

    prepared_data = pyLDAvis.prepare(
        topic_term_dists,
        doc_topic_dists,
        doc_lengths,
        vocab,
        term_frequency,
        sort_topics=False,
        start_index=1,
    )
    pyLDAvis.save_html(prepared_data, f"paragraphtrfldavis{n_topics}.html")


  and should_run_async(code)
