In [6]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel
import logging

# 1. 学習中の状況を出力するフォーマットの指定
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

num_topics=200

dictionary = Dictionary.load_from_text('../data/jawiki_wordids.txt.bz2')
tfidf_corpus = MmCorpus('../data/jawiki_tfidf.mm')
lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary, iterations=400, num_topics=num_topics, passes=20)
lda.save('../data/lda_200.model')

2024-09-22 18:41:14,519 : INFO : loaded corpus index from ../data/jawiki_tfidf.mm.index
2024-09-22 18:41:14,519 : INFO : initializing cython corpus reader from ../data/jawiki_tfidf.mm
2024-09-22 18:41:14,520 : INFO : accepted corpus with 1307644 documents, 100000 features, 220154677 non-zero entries
2024-09-22 18:41:14,524 : INFO : using symmetric alpha at 0.005
2024-09-22 18:41:14,525 : INFO : using symmetric eta at 0.005
2024-09-22 18:41:14,536 : INFO : using serial LDA version on this node
2024-09-22 18:41:15,819 : INFO : running online (multi-pass) LDA training, 200 topics, 20 passes over the supplied corpus of 1307644 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 400x with a convergence threshold of 0.001000
2024-09-22 18:41:16,467 : INFO : PROGRESS: pass 0, at document #2000/1307644
2024-09-22 18:41:39,677 : INFO : merging changes from 2000 documents into a model of 1307644 documents
2024-09-22 18:41:40,402 : INFO : to

In [54]:
# 6b. LDAモデルによって得られるトピックの抽出
top_topics = lda.top_topics(tfidf_corpus)
coherences = [coherence for topic_words, coherence in top_topics]

# 6c. 評価指標コヒーレンスを求める
avg_topic_coherence = sum(coherences) / num_topics
print(f'Average topic coherence: {avg_topic_coherence:.4f}.')

2024-09-21 21:43:38,925 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2024-09-21 21:43:39,381 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2024-09-21 21:43:39,725 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2024-09-21 21:43:40,098 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2024-09-21 21:43:40,353 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2024-09-21 21:43:40,613 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2024-09-21 21:43:40,903 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2024-09-21 21:43:41,240 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2024-09-21 21:43:41,540 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2024-09-21 21:43:41,835 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2024-09-21 21:43:42,107 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2024-09-21 21:43:42

Average topic coherence: -3.3802.


In [2]:
import pickle

with open('../data/jawiki_bow.mm.metadata.cpickle', 'rb') as f:
    docno2metadata = pickle.load(f)
title2docno = {tup_title[1]: int(docno) for docno, tup_title in docno2metadata.items()}

In [90]:
for title in ['モモ']:
    topics = lda[tfidf_corpus[title2docno[title]]]
    topic = sorted(topics, key=lambda t: t[1], reverse=True)[0][0]
    print('=== %s (topic %d) ===' % (title, topic))
    for word, p_word in lda.show_topic(topic, topn=10):
        print('%.5f\t%s' % (p_word, word))

=== カキ (topic 79) ===
0.02045	分布
0.01361	分類
0.01293	植物
0.01209	化石
0.01014	絶滅
0.00983	cm
0.00853	個体
0.00822	mm
0.00789	動物
0.00701	先端
