In [21]:
import spacy
from collections import Counter
import tomotopy as tp
import os
from tqdm import tqdm
import pandas
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

## Load data

In [22]:
# Load the spaCy pipeline 'de_core_news_lg'; `nlp` is used below both to flag
# stopwords on its vocabulary and to parse every corpus file.
nlp = spacy.load('de_core_news_lg')

In [23]:
# Corpus-specific stopwords: a hand-picked list of noise tokens, followed by
# every integer from 0 to 999 rendered as a string.
custom_stopwords = [
    'B.', '$', 'Fig', 'z.', 'MATH', '=', "S.", "Seite", "2c", "GRAPHIC", "pag", "NB", "C.",
    "s.", "u.", "v.", "k", "l", "i", "R.", "H.", "de", "F.", "d.", "h.", "c.", "J.", "a.", "M.",
]
custom_stopwords.extend(map(str, range(1000)))

In [24]:
# Mark every custom stopword on its vocabulary entry so that the
# `token.is_stop` filter applied when building documents drops it.
for stopword in custom_stopwords:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

# Explicitly clear the stopword flag for these words so they are kept.
for kept_word in ("a",):
    lexeme = nlp.vocab[kept_word]
    lexeme.is_stop = False

In [28]:
# Collect the corpus file names.
# os.listdir returns entries in arbitrary order, so sort them to make the
# document order (and everything derived from it) reproducible. Only .txt
# entries are kept so that stray files or sub-directories in the folder are
# not opened as corpus texts later.
file_list = sorted(
    file
    for file in os.listdir('../data/all_txt/')
    if file.lower().endswith('.txt')
)

print(file_list)

['CAP1905.txt', 'HAU1853.txt', 'HOS1879.txt', 'KRA1852.txt', 'KUN1863.txt', 'NAU1858.txt', 'OET1866.txt', 'RIE1905.txt', 'THU1877.txt', 'WEI1860.txt', 'WEI1861.txt']


In [29]:
# Parse every corpus file with spaCy and keep the resulting Doc in `texts`,
# keyed by the part of the file name before the first dot.
texts = {}
for file in tqdm(file_list):
    with open(f'../data/all_txt/{file}', 'r', encoding='utf-8') as f:
        # Remove every '¬' that is directly followed by a newline, together
        # with that newline (presumably a line-break hyphenation marker in
        # the transcriptions - TODO confirm).
        text = f.read().replace('¬\n', '')

    # Parsing happens after the file has been closed; it only needs the string.
    name = file.partition('.')[0]
    doc = nlp(text)
    texts[name] = doc

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [03:30<00:00, 19.15s/it]


## Book-level topic modeling

In [36]:
# Book-level LDA model with 5 topics.
# A fixed seed replaces the library's random default so that repeated runs start
# from the same random state. NOTE(review): results may still depend on the number
# of training workers - confirm against the tomotopy documentation.
mdl_b = tp.LDAModel(k=5, seed=42)

In [37]:
# Feed each parsed book to the book-level model as one document consisting of
# the surface forms of its tokens, skipping stopwords, punctuation and whitespace.
for txt in texts.values():
    words = []
    for token in txt:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        words.append(token.text)
    mdl_b.add_doc(words)
    


In [38]:
# Train the book-level model; 2000 is passed as the first positional argument
# of train (the iteration count per tomotopy's signature - confirm).
mdl_b.train(2000)

In [39]:
# Write the trained book-level model to 'bookmodel.bin' (relative to the working directory).
# NOTE(review): per the tomotopy docs, full=False stores only the topic parameters
# (usable for inference, not for continued training) - confirm this is intended.
mdl_b.save('bookmodel.bin', full=False)

## 100-word-level topic modeling

In [40]:
# LDA model with 5 topics for the 100-word chunk documents.
# A fixed seed replaces the library's random default so that repeated runs start
# from the same random state. NOTE(review): results may still depend on the number
# of training workers - confirm against the tomotopy documentation.
mdl_100 = tp.LDAModel(k=5, seed=42)

In [41]:
# Split the filtered tokens of every book into consecutive slices of up to 100
# words and add each slice to the model as its own document. The final slice
# of a book may be shorter than 100 words.
for txt in texts.values():
    words = []
    for token in txt:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        words.append(token.text)

    for i in range(0, len(words), 100):
        chunk = words[i:i + 100]
        mdl_100.add_doc(chunk)

In [42]:
# Train the 100-word-chunk model; 2000 is passed as the first positional argument
# of train (the iteration count per tomotopy's signature - confirm).
mdl_100.train(2000)

In [43]:
# Write the trained 100-word-chunk model to '100model.bin' (relative to the working directory).
# NOTE(review): per the tomotopy docs, full=False stores only the topic parameters
# (usable for inference, not for continued training) - confirm this is intended.
mdl_100.save("100model.bin", full=False)

## Chapter-level topic modeling

In [11]:
# Chapter-level LDA model with 5 topics.
# A fixed seed replaces the library's random default so that repeated runs start
# from the same random state. NOTE(review): results may still depend on the number
# of training workers - confirm against the tomotopy documentation.
mdl_c = tp.LDAModel(k=5, seed=42)

## Paragraph-level topic modeling

In [12]:
# Paragraph-level LDA model with 5 topics.
# A fixed seed replaces the library's random default so that repeated runs start
# from the same random state. NOTE(review): results may still depend on the number
# of training workers - confirm against the tomotopy documentation.
mdl_p = tp.LDAModel(k=5, seed=42)