In [1]:
import spacy
from collections import Counter
import tomotopy as tp
import os
from tqdm import tqdm
import pandas

## Prepare data

In [3]:
# Load the German spaCy pipeline; its token attributes (stop-word, punctuation,
# whitespace and part-of-speech flags) are used by the cells below.
spacy_model_name = 'de_core_news_lg'
nlp = spacy.load(spacy_model_name)

In [4]:
# Collect the names of the input files.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# every later step (parsing order, insertion order of `texts`, order of the LDA
# documents) reproducible between runs and machines.
# Entries that are not regular files (e.g. a stray sub-directory) are skipped,
# because the next cell opens every entry with open().
file_list = sorted(
    file
    for file in os.listdir('data/all_txt/')
    if os.path.isfile(os.path.join('data/all_txt/', file))
)

In [5]:
# Parse every input file with spaCy and store the resulting Doc under the file's
# base name. The Docs are stored as parsed; no lemmatisation is applied here.
texts = dict()
for file in tqdm(file_list):
    # Read the whole file first so the handle is closed before the slow NLP step.
    with open(f'data/all_txt/{file}', 'r', encoding='utf-8') as f:
        text = f.read()

    # Drop '¬' + newline sequences (presumably line-break hyphenation marks in
    # the transcriptions -- TODO confirm) so split words are re-joined.
    text = text.replace('¬\n', '')

    # `doc` stays bound after the loop; later cells inspect this last parsed text.
    doc = nlp(text)

    # splitext() removes only the final extension, so names that contain further
    # dots are no longer truncated at the first dot (which could make two files
    # share a key and silently overwrite each other).
    name = os.path.splitext(file)[0]

    texts[name] = doc
        

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [08:32<00:00, 64.03s/it]


In [6]:
# Per-text summary: the 10 most frequent tokens, the 10 most frequent nouns and
# the number of distinct tokens. Stop words and punctuation are ignored for both
# lists; whitespace tokens are additionally ignored for the general token list.
entries = []
for name, txt in texts.items():
    words, nouns = [], []
    for token in txt:
        if token.is_stop or token.is_punct:
            continue
        if not token.is_space:
            words.append(token.text)
        if token.pos_ == "NOUN":
            nouns.append(token.text)

    # frequency tables (surface forms, case-sensitive)
    word_freq = Counter(words)
    noun_freq = Counter(nouns)

    # vocabulary = distinct tokens
    vocab = set(words)
    vocab_size = len(vocab)

    common_words = word_freq.most_common(10)
    common_nouns = noun_freq.most_common(10)

    entry = {
        'name': name,
        'common words': common_words,
        'common nouns': common_nouns,
        'vocabulary size': vocab_size,
    }
    entries.append(entry)
    

In [11]:
# Corpus-wide summary: the same statistics as the per-text entries, computed over
# the tokens of all parsed texts together.
glob_words = []
glob_nouns = []

for txt in texts.values():
    for token in txt:
        # Stop words and punctuation are excluded from both lists.
        if token.is_stop or token.is_punct:
            continue
        if not token.is_space:
            glob_words.append(token.text)
        if token.pos_ == "NOUN":
            glob_nouns.append(token.text)

glob_vocab = set(glob_words)

# most common tokens
word_freq = Counter(glob_words)
glob_common_words = word_freq.most_common(10)

# most common noun tokens
noun_freq = Counter(glob_nouns)
glob_common_nouns = noun_freq.most_common(10)

# vocabulary size
glob_vocab_size = len(glob_vocab)

glob_entry = {
    'name': 'all texts',
    'common words': glob_common_words,
    'common nouns': glob_common_nouns,
    'vocabulary size': glob_vocab_size,
}

# Keep the cell idempotent: drop a summary row left over from an earlier run of
# this cell before appending, so re-running never duplicates the 'all texts' row.
# (This assumes no input text is itself named 'all texts'.)
entries[:] = [entry for entry in entries if entry['name'] != glob_entry['name']]
entries.append(glob_entry)


In [16]:
# Display the corpus-wide ('all texts') summary entry.
glob_entry

{'name': 'all texts',
 'common words': [('$', 922),
  ('Töne', 662),
  ('c', 605),
  ('e', 543),
  ('Terz', 476),
  ('g', 450),
  ('Ton', 436),
  ('C', 435),
  ('Folge', 407),
  ('Bedeutung', 406)],
 'common nouns': [('Töne', 662),
  ('Terz', 439),
  ('Ton', 435),
  ('Folge', 407),
  ('Bedeutung', 406),
  ('Tonart', 352),
  ('Grundton', 348),
  ('B.', 285),
  ('Accorde', 282),
  ('Dissonanz', 254)],
 'vocabulary size': 19576}

In [12]:
# One row per summary entry; the dict keys become the column names.
df = pandas.DataFrame(entries)

In [15]:
# Write the summary table to text_stats.csv as UTF-8. `index` is left at its
# default, so the row index is written as the first column.
df.to_csv('text_stats.csv', encoding='utf-8')

In [20]:
# Tokens of `doc` that are neither stop words, punctuation nor whitespace.
# NOTE: `doc` is not defined in this cell; it is whatever Doc the parsing loop
# assigned last, i.e. this inspects a single text.
words = [
    tok.text
    for tok in doc
    if not (tok.is_stop or tok.is_punct or tok.is_space)
]

In [21]:
# Noun tokens of `doc` (the Doc left over from the parsing loop), ignoring
# stop words and punctuation.
nouns = [
    tok.text
    for tok in doc
    if not (tok.is_stop or tok.is_punct) and tok.pos_ == "NOUN"
]

In [25]:
# Distinct token strings in `words` (no case folding is applied).
vocab = set(words)

In [26]:
# Frequency tables for the token and noun lists built in the cells above.
word_freq, noun_freq = Counter(words), Counter(nouns)

# number of distinct tokens
vocab_size = len(vocab)

# the 15 most frequent tokens and the 15 most frequent noun tokens
common_words = word_freq.most_common(15)
common_nouns = noun_freq.most_common(15)

In [23]:
# Display the most frequent tokens as (token, count) pairs.
common_words

[('c', 161),
 ('S.', 159),
 ('e', 159),
 ('g', 128),
 ('C', 122),
 ('$', 118),
 ('Riemann', 104),
 ('d', 92),
 ('h', 85),
 ('f', 82),
 ('Fig', 71),
 ('Moll', 62),
 ('GRAPHIC', 58),
 ('Terz', 55),
 ('Grundton', 51)]

In [24]:
# Display the most frequent noun tokens as (token, count) pairs.
common_nouns

[('S.', 154),
 ('Terz', 52),
 ('Grundton', 51),
 ('Töne', 47),
 ('Auffassung', 36),
 ('Dur', 35),
 ('Tonika', 34),
 ('Umkehrung', 33),
 ('Konsonanz', 33),
 ('Akkord', 32),
 ('g', 30),
 ('Ton', 29),
 ('Unterdominante', 29),
 ('Septime', 28),
 ('Sexte', 27)]

In [28]:
# Display the number of distinct tokens.
vocab_size

4921

In [25]:
# Display the total number of tokens in `words` (repetitions included).
len(words)

13368

In [31]:
# LDA topic model with 10 topics.
# The seed is fixed so the sampled topics do not change arbitrarily between
# kernel restarts. NOTE(review): tomotopy results may still differ across
# library versions or worker counts -- confirm if exact reproducibility matters.
mdl = tp.LDAModel(k=10, seed=42)

In [32]:
# Feed the model: each text is reduced to its content tokens (no stop words,
# punctuation or whitespace) and cut into consecutive chunks of at most 100
# tokens; every chunk is added to the model as a separate document.
chunk_size = 100
for txt in texts.values():
    words = [
        tok.text
        for tok in txt
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]
    for start in range(0, len(words), chunk_size):
        chunk = words[start:start + chunk_size]
        mdl.add_doc(chunk)

In [33]:
# Train the model on the added documents; the argument is the number of
# training iterations.
mdl.train(100)

In [34]:
# Print the ten highest-probability words of every topic.
for topic_id in range(mdl.k):
    top_words = mdl.get_topic_words(topic_id, top_n=10)
    print(f'Top 10 words of topic #{topic_id}')
    print(top_words)

Top 10 words of topic #0
[('c', 0.041490588337183), ('g', 0.030912037938833237), ('e', 0.02754613384604454), ('d', 0.023974156007170677), ('f', 0.021776016801595688), ('h', 0.01476944237947464), ('1', 0.013739064335823059), ('phon', 0.009686242789030075), ('=', 0.008930631913244724), ('Klänge', 0.008449789136648178)]
Top 10 words of topic #1
[('$', 0.092230886220932), ('MATH', 0.032011374831199646), ('Töne', 0.02130790427327156), ('=', 0.01390550471842289), ('Terz', 0.011804825626313686), ('GRAPHIC', 0.011204631067812443), ('Octave', 0.009304014965891838), ('reinen', 0.008903885260224342), ('Intervalle', 0.008803852833807468), ('Schwingungszahlen', 0.008503755554556847)]
Top 10 words of topic #2
[('Metrum', 0.015257209539413452), ('Einheit', 0.015024293214082718), ('Bestimmung', 0.014209085144102573), ('metrischen', 0.013859710656106472), ('Glied', 0.013626793399453163), ('Ordnung', 0.012811585329473019), ('metrische', 0.011763460002839565), ('Bedeutung', 0.009783667512238026), ('Forma

In [35]:
# numpy was used below without ever being imported, which raises a NameError as
# soon as pyLDAvis is installed and its import succeeds.
import numpy as np
import pyLDAvis

# Convert the trained model into the arrays that pyLDAvis.prepare() takes:
# one row of word probabilities per topic ...
topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
# ... one row of topic probabilities per model document ...
doc_topic_dists = np.stack([lda_doc.get_topic_dist() for lda_doc in mdl.docs])
# ... and the number of words in each model document.
doc_lengths = np.array([len(lda_doc.words) for lda_doc in mdl.docs])
# Named `lda_vocab` so the `vocab` token set built earlier is not overwritten
# by a different kind of object.
lda_vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    lda_vocab,
    term_frequency,
    sort_topics=False,  # keep the model's topic order
    start_index=0,      # number topics from 0, like the printed topic list
)



ModuleNotFoundError: No module named 'pyLDAvis'

In [None]:
# Save the prepared visualisation to ldavis.html.
pyLDAvis.save_html(prepared_data, "ldavis.html")

In [None]:
# Switch pyLDAvis to notebook mode before displaying the visualisation inline.
pyLDAvis.enable_notebook()

In [None]:
# Show the prepared visualisation in the cell output.
pyLDAvis.display(prepared_data)