# Ensemble LDA

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Corpus

In [2]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = api.load('text8')

dictionary = Dictionary()
for doc in docs:
    dictionary.add_documents([[lemmatizer.lemmatize(token) for token in doc]])
dictionary.filter_extremes(no_below=20, no_above=0.5)

corpus = [dictionary.doc2bow(doc) for doc in docs]

2023-04-04 14:30:15,409 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-04-04 14:30:15,426 : INFO : built Dictionary<2312 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...> from 1 documents (total 10000 corpus positions)
2023-04-04 14:30:15,555 : INFO : adding document #0 to Dictionary<2312 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...>
2023-04-04 14:30:15,573 : INFO : built Dictionary<3906 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...> from 2 documents (total 20000 corpus positions)
2023-04-04 14:30:15,692 : INFO : adding document #0 to Dictionary<3906 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...>
2023-04-04 14:30:15,719 : INFO : built Dictionary<5147 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...> from 3 documents (total 30000 corpus positions)
2023-04-04 14:30:15,846 : INFO : adding document #0 to Dictionary<5147 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']

# Training

In [3]:
from gensim.models import LdaModel
topic_model_class = LdaModel

ensemble_workers = 4
num_models = 8

distance_workers = 4

num_topics = 20
passes = 2

from gensim.models import EnsembleLda
ensemble = EnsembleLda(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    num_models=num_models,
    topic_model_class=LdaModel,
    ensemble_workers=ensemble_workers,
    distance_workers=distance_workers
)

print(len(ensemble.ttda))
print(len(ensemble.get_topics()))

2023-04-04 14:33:53,239 : INFO : generating 8 topic models using 4 workers
2023-04-04 14:44:16,722 : INFO : generating a 160 x 160 asymmetric distance matrix...
2023-04-04 14:44:28,490 : INFO : fitting the clustering model, using 4 for min_samples
2023-04-04 14:44:28,569 : INFO : generating stable topics, using 3 for min_cores
2023-04-04 14:44:28,569 : INFO : found 2 clusters
2023-04-04 14:44:28,600 : INFO : found 1 stable topics
2023-04-04 14:44:28,616 : INFO : generating classic gensim model representation based on results from the ensemble
2023-04-04 14:44:28,914 : INFO : using symmetric alpha at 1.0
2023-04-04 14:44:28,914 : INFO : using symmetric eta at 1.0
2023-04-04 14:44:28,914 : INFO : using serial LDA version on this node
2023-04-04 14:44:28,914 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence t

160
1


# Tuning

In [4]:
import numpy as np
shape = ensemble.asymmetric_distance_matrix.shape
without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0], dtype=bool)].reshape(shape[0], -1)
print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max())

ensemble.recluster(eps=0.09, min_samples=2, min_cores=2)

print(len(ensemble.get_topics()))

2023-04-04 14:44:29,055 : INFO : fitting the clustering model


0.00652840948496336 0.037036847551971244 0.14483295280732034


2023-04-04 14:44:29,134 : INFO : generating stable topics
2023-04-04 14:44:29,134 : INFO : found 4 clusters
2023-04-04 14:44:29,181 : INFO : found 1 stable topics
2023-04-04 14:44:29,181 : INFO : generating classic gensim model representation based on results from the ensemble
2023-04-04 14:44:29,181 : INFO : using symmetric alpha at 1.0
2023-04-04 14:44:29,181 : INFO : using symmetric eta at 1.0
2023-04-04 14:44:29,196 : INFO : using serial LDA version on this node
2023-04-04 14:44:29,212 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
2023-04-04 14:44:29,212 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel<num_terms=20076, num_topics=1, decay=0.5, chunksize=2000> in 0.00s', 'datetime': '2023-04-04T14:44:29.212611', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023

1


# Increasing the Size

In [5]:
from gensim.models import LdaMulticore

model1 = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=9,
    passes=4,
)

model2 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=11,
    passes=2,
)

# add_model supports various types of input, check out its docstring
ensemble.add_model(model1)
ensemble.add_model(model2)

ensemble.recluster()

print(len(ensemble.ttda))
print(len(ensemble.get_topics()))

2023-04-04 14:44:29,416 : INFO : using symmetric alpha at 0.1111111111111111
2023-04-04 14:44:29,416 : INFO : using symmetric eta at 0.1111111111111111
2023-04-04 14:44:29,432 : INFO : using serial LDA version on this node
2023-04-04 14:44:29,479 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000
2023-04-04 14:44:29,479 : INFO : training LDA model using 7 processes
2023-04-04 14:44:44,092 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1
2023-04-04 14:45:00,493 : INFO : topic #0 (0.111): 0.036*"as" + 0.002*"km" + 0.001*"soviet" + 0.001*"jewish" + 0.001*"energy" + 0.001*"bc" + 0.001*"band" + 0.001*"india" + 0.001*"japanese" + 0.001*"russian"
2023-04-04 14:45:00,498 : INFO : topic #2 (0.111): 0.033*"as" + 0.001*"irish" + 0.001*"km" + 0.001*"emperor" + 0.001*"software" 

180
1
