In [21]:
import spacy
from collections import Counter
import tomotopy as tp
import os
from tqdm import tqdm
import pandas
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

## Load data

In [22]:
# Load german model
nlp = spacy.load('de_core_news_lg')

In [23]:
custom_stopwords = ['B.', '$', 'Fig', 'z.', 'MATH', '=', "S.", "Seite", "2c", "GRAPHIC", "pag", "NB", "C.",
                    "s.", "u.", "v.", "k", "l", "i", "R.", "H.", "de", "F.", "d.", "h.", "c.", "J.", "a.", "M."] + [str(i) for i in range(1000)]

In [24]:
for c in custom_stopwords:
    nlp.vocab[c].is_stop = True
    
# explicitly include words in vocab 
for c in ["a"]:
    nlp.vocab[c].is_stop = False

In [28]:
# Parse files
file_list = []
for file in os.listdir('../data/all_txt/'):
    file_list.append(file)
    
print(file_list)

['CAP1905.txt', 'HAU1853.txt', 'HOS1879.txt', 'KRA1852.txt', 'KUN1863.txt', 'NAU1858.txt', 'OET1866.txt', 'RIE1905.txt', 'THU1877.txt', 'WEI1860.txt', 'WEI1861.txt']


In [29]:
# Create a dictionary of all lemmatized texts
texts = dict()
for file in tqdm(file_list):
    with open(f'../data/all_txt/{file}', 'r', encoding='utf-8') as f:
        text = f.read()
        text = text.replace('¬\n', '')
        doc = nlp(text)
        name = file.split('.')[0]
        
        texts[name] = doc

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [03:30<00:00, 19.15s/it]


## Merge paragraphs

In [45]:
import xml.etree.ElementTree as ET

In [60]:
ns = { "pcgts" : "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }

In [109]:
folders = [ 
    'CAP1905',
    'HAU1853',
    'HOS1879',
    'KRA1852',
    'NAU1858',
    'OET1866',
    'RIE1905',
    'THU1877',
    'WEI1860',
    'WEI1861',
]

In [44]:
path = '../data/CAP1905/CAP1905/page/'

In [110]:
xmls = []
for file in os.listdir(path):
    xmls.append(file)

In [56]:
paragraphs = []

In [112]:
for folder in folders:
    path = f'../data/{folder}/{folder}/page/'

    for file in os.listdir(path):
        
        tree = ET.parse(path+file)
        root = tree.getroot()

        for region in root.findall(".//pcgts:TextRegion", ns):
            print(region.attrib['custom'])
            if "{type:paragraph;}" in region.attrib['custom']:
                print("true")
                texts = region.findall(".//pcgts:TextEquiv", ns)
                paragraph = texts[-1].find(".//pcgts:Unicode", ns).text
                paragraphs.append(paragraph)
            elif "{type:paragraph-continued;}" in region.attrib['custom']:
                texts = region.findall(".//pcgts:TextEquiv", ns)
                paragraph = texts[-1].find(".//pcgts:Unicode", ns).text
                lasttext = paragraphs[-1]
                paragraph = lasttext+" "+paragraph
                paragraphs.pop()
                paragraphs.append(paragraph)

readingOrder {index:0;} structure {type:other;}
readingOrder {index:0;} structure {type:other;}
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:TOC-entry;}
readingOrder {index:2;} structure {type:TOC-entry;}
readingOrder {index:3;} structure {type:TOC-entry;}
readingOrder {index:4;} structure {type:TOC-entry;}
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} st

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {ty

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:heading;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;}
readingOrder {index:

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:list-continued;}
readingOrder {index:6;} structure {type:list;}
readingOrder {index:8;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:8;} structure {type:paragraph;}
true
readingOrder {index:10;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:2;} structure {type:list-continued;}
readingOrder {index

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:7;} structure {type:paragraph;}
true
readingOrder {index:9;} structure {type:paragraph;}
true
readingOrder {index:10;} structure {type:paragraph;}
true
readingOrder {index:12;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:heading;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;} structure {type:heading;}
readingOrder {index:5;} structure {type:heading;}
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:p

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {t

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:heading;}
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:heading;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:heading;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;}
readingOrder {index:5;}
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:7;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:8;

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;}
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:5;}
readingOrder {index:6;}
readingOrder {index:7;} structure {type:heading;}
readingOrder {index:8;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readin

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:8;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:heading;}
readingOrder {index:3;} structure {type:heading;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:7;} structure {type:paragraph;}
true
readingOrder {index:8;} struc

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:0;} structure {type:other;}
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;}
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:he

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:heading;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:5;} structure {type:footnote;}
readingOrder {index:6;} structure {type:footnote;}
readingOrder {index:0;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:2;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:footnote-continued;}
readingOrder {index:6;} structure {type:footnote;}
readingOrder {index:7;} structure {type:footnote-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:footnote-continued;}
readingOrder {index:7;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:heading;}
readingOrder {index:2;} structure {type:paragraph

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:7;} structure {type:paragraph-continued;}
readingOrder {index:8;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:7;} structure {type:paragraph;}
true
readingOrder {index:8;} structure {type:paragraph;}
true
readingOrder {index:10;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph-continued;}
readingOrder {index:5

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {inde

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:8;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:7;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingO

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:7;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:list;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:p

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:6;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:other;}
readingOrder {index:0;} structure {type:other;}
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:paragraph;}
true

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:heading;}
readingOrder {index:5;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:5;} structure {type:footnote;}
readingOrder {index:6;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:5;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:missing;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:footnote;}
readingOrder {index:6;} structure {type:missing;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:missing;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;}
readingOrder {index:5;}
readingOrder {index:6;}
readingOrder {index:7;}
readingOrder {index:9;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:footnote;}
readingOrder {index:4;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:0;} structure {type:heading;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:7;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:heading;}
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:list;}
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:7;} structure {type:list-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:list;}
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:list;}
readingOrde

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:heading;}
readingOrder {index:6;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:footnote;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:heading;}
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:8;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-num

readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph;}
true
readingOrder {index:2;} structure {type:paragraph;}
true
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:missing;}
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:3;} structure {type:paragraph;}
true
readingOrder {index:4;} structure {type:paragraph;}
true
readingOrder {index:5;} structure {type:heading;}
readingOrder {index:6;} structure {type:paragraph-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;} structure {type:paragraph-continued;}
readingOrder {index:2;} structure {type:footnote-continued;}
readingOrder {index:0;} structure {type:page-number;}
readingOrder {index:1;}
readingOrder {index:2;}
readingOrder {index:3;}
readingOrder {inde

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../data/RIE1905/RIE1905/page/'

In [106]:
paragraphs

['an Fig. 46, so exotisch-fremdartig auch diese Klang- und Tonfolge\ndem deutschen Leser vorkommen mag, doch bezweifeln (vgl. o. S. 36). Auch dem verminderten Dreiklang in Fig. 47 muss\r\nman Konsonanzwert zusprechen, da durch die Verdoppelung des f\r\ndas nötige Gegengewicht gegen den starken Oberton fis des Grund¬\r\nbasses H geschaffen wird, so dass statt H D ♮fis (a) die Auffassung\r\n(G) h d f mit h als Terz näher gelegt wird. Damit erhält sogar\r\ndie hypophrygische Kirchentonleiter h c d e f g a h, oder modern-\r\nharmonisch ausgedrückt:',
 'eine gewisse Berechtigung und wäre als „Extremes Grossmoll” dem\r\n„Grossmoll” (Phrygisch) in Teil I S. 127 beizugesellen.',
 'Um zu beweisen, dass weder die Anhänger der Durtrübung\r\nnoch der Kopfstellung mit ihrer Mollauffassung ausreichen, sei\r\nnoch Fig. 48 als Beispiel angeführt. Wird hier statt dis d ge¬\r\nspielt, so ist sicherlich das g—h am Schlusse als G-durklang zu\r\nerklären. Mit dis drängt sich dagegen g—h als Mollklang e g h

In [94]:
for text in text:
    try:
        print(text.find(".//pcgts:Unicode", ns).text)
    except:
        pass

Endlich sei die Konsonanzfrage betreffs des Dur- und Moll¬
klanges gegenüber den Dualisten nochmals genau präzisiert: Sie
ist nicht im Anschluss an die Gegenüberstellung v. Oettingens
(tonischer Grundton $$GRAPHIC phonischer Oberton)
so zu entscheiden: „Der Durakkord ist tonisch konsonant und
phonisch dissonant, der Mollakkord ist phonisch konsonant und
tonisch dissonant” (wie auch Riemann, Problemschrift S. 24, be¬
hauptet), sondern so: Der Durakkord c e g ist (in Grund¬
stellung) in jedem tonalen Zusammenhange kon¬
sonant, der Mollakkord a c e ist konsonant als Doppel¬
klang A C e (g) und als einfacher Klang (D fis) a c e,
dagegen dissonant als Doppelklang (F) a C e (g) und
als einfacher Klang A ♮cis e. Dur und Moll sind stets
„monistisch”, d. h. von der Grundbassperspektive
aus zu betrachten, entsprechend dem tatsächlichen
Hören.
Endlich sei die Konsonanzfrage betreffs des Dur- und Moll¬
klanges gegenüber den Dualisten nochmals genau präzisiert: Sie
ist nicht im Anschluss an die G

## Book-level topic modeling

In [36]:
mdl_b = tp.LDAModel(k=5)

In [37]:
for name, txt in texts.items():
    words = [token.text
         for token in txt
         if not token.is_stop and not token.is_punct and not token.is_space]
    mdl_b.add_doc(words)
    


In [38]:
mdl_b.train(2000)

In [39]:
mdl_b.save('bookmodel.bin', full=False)

## 100-word-level topic modeling

In [40]:
mdl_100 = tp.LDAModel(k=5)

In [41]:
for txt in texts.values():
    words = [token.text
         for token in txt
         if not token.is_stop and not token.is_punct and not token.is_space]
    for i in range(0,len(words),100):
        mdl_100.add_doc(words[i: i+100])

In [42]:
mdl_100.train(2000)

In [43]:
mdl_100.save("100model.bin", full=False)

## Chapter-level topic modeling

In [11]:
mdl_c = tp.LDAModel(k=5)

## Paragraph-level topic modeling

In [12]:
mdl_p = tp.LDAModel(k=5)