In [1]:
import sys
import tomotopy as tp
import numpy as np
import re
import nltk

In [None]:
def hlda_example(input_file, save_path):
    """Train a hierarchical LDA model on a text corpus and show test inferences.

    Steps performed:

    1. Load a preprocessed corpus from ``input_file + '.cached.cps'``. If that
       load raises ``IOError``, build the corpus from ``input_file`` (read as
       UTF-8) using Porter stemming and a stopword filter that drops tokens of
       length <= 2 and English stopwords, then save the cache for later runs.
    2. Shuffle document indices with a fixed seed (42); the first 20 shuffled
       documents become the test set and the rest the training set.
    3. Train a depth-4 ``tp.HLDAModel`` on the training set, printing the
       log-likelihood per word and the number of live topics every 10
       iterations.
    4. Print the model summary, save the model to ``save_path``, then infer
       the held-out documents and print each one's path and top 10 words.

    Args:
        input_file: Path to the raw text corpus. The cache file is written
            next to it with the suffix ``.cached.cps``.
        save_path: Destination path for the trained model.

    Returns:
        None. Results are reported via stdout/stderr and the saved model file.

    Note:
        Calls ``np.random.seed(42)``, which reseeds NumPy's global RNG.
        Building the corpus from scratch needs the NLTK English stopword list
        to be available locally.
    """
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords

    cache_path = input_file + '.cached.cps'
    try:
        cps = tp.utils.Corpus.load(cache_path)
    except IOError:
        # No usable cache: tokenize, stem and filter the raw text.
        stemmer = PorterStemmer()
        stops = set(stopwords.words('english'))
        cps = tp.utils.Corpus(
            tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
            stopwords=lambda x: len(x) <= 2 or x in stops
        )
        # Use a context manager so the input file handle is always closed,
        # even if processing fails part-way through.
        with open(input_file, encoding='utf-8') as fin:
            cps.process(fin)
        cps.save(cache_path)

    # Fixed seed keeps the train/test split reproducible across runs.
    np.random.seed(42)
    ridcs = np.random.permutation(len(cps))
    test_idcs = ridcs[:20]
    train_idcs = ridcs[20:]

    test_cps = cps[test_idcs]
    train_cps = cps[train_idcs]

    mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10, corpus=train_cps)
    # Zero-iteration call made before reading the statistics printed below
    # (presumably it finalizes vocabulary/initialization -- confirm in tomotopy docs).
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    def report_progress():
        # Single place for the progress line shared by both training phases.
        print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))

    # Phase 1: 100 rounds, each 7 regular iterations followed by 3 iterations
    # with freeze_topics=True.
    for _ in range(0, 1000, 10):
        mdl.train(7)
        mdl.train(3, freeze_topics=True)
        report_progress()

    # Phase 2: 10 rounds of 10 iterations, all with freeze_topics=True.
    for _ in range(0, 100, 10):
        mdl.train(10, freeze_topics=True)
        report_progress()

    mdl.summary()
    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    # The log-likelihood returned by infer() is not used here.
    test_result_cps, ll = mdl.infer(test_cps)
    for doc in test_result_cps:
        print(doc.path, doc.get_words(top_n=10))


if __name__ == '__main__':
    # Run the demo: read the raw corpus file and write the trained model file.
    hlda_example(
        input_file='enwiki-16000.txt',
        save_path='test.hlda.tmm',
    )

Training...


Num docs: 15980 , Vocab size: 17000 , Num words: 3303761
Removed top words: ['thi', 'also', 'first', 'one', 'new', 'state', 'use', 'year', 'two', 'includ']
Iteration: 00010	ll per word: -8.40584	Num. of topics: 1755
Iteration: 00020	ll per word: -8.12848	Num. of topics: 2216
Iteration: 00030	ll per word: -8.06211	Num. of topics: 2350
Iteration: 00040	ll per word: -8.02671	Num. of topics: 2407
Iteration: 00050	ll per word: -8.00521	Num. of topics: 2452
Iteration: 00060	ll per word: -7.98791	Num. of topics: 2483
Iteration: 00070	ll per word: -7.97682	Num. of topics: 2478
Iteration: 00080	ll per word: -7.96595	Num. of topics: 2495
Iteration: 00090	ll per word: -7.95793	Num. of topics: 2504
Iteration: 00100	ll per word: -7.95041	Num. of topics: 2526
Iteration: 00110	ll per word: -7.94312	Num. of topics: 2546
Iteration: 00120	ll per word: -7.93804	Num. of topics: 2531
Iteration: 00130	ll per word: -7.93332	Num. of topics: 2550
Iteration: 00140	ll per word: -7.92975	Num. of topics: 2547
Iter