In [1]:
%%capture
%run 3-tfidf-gensim.ipynb

In [7]:
from gensim.models import LdaModel, CoherenceModel
# %pip install pyldavis==3.3.1
import pyLDAvis, pyLDAvis

The LDA model uses Dirichlet distributions. The parameters of Dirichlet distributions can be either symmetrical, in which case all the values of the variable will have equal weight, or asymmetrical, where some values have higher weight than others. Moreover, the parameters can be equal to, smaller or greater than 1. For parameters equal to 1, the distribution will assign uniform probability. For values greater than 1, more probability will be assigned to the center, while for values smaller than 1 more probability will be assigned to the corners.

In the LDA model, alpha refers to the Dirichlet distribution that assigns documents to topics. For alpha asymmetric, we might risk of having topics more probable than others. For alpha smaller than 1, we are assuming the documents consist of a small number of topics, while for alpha greater than we assume the documents can be composed of more topics. In our case, it'd be better to have a symmetrical alpha, since we don't know if there's a topic much more likely than others.

For eta we apply the same logic but with topics and words. Eta smaller than 1 means we're assuming topics are not composed of many words, while Eta greater than 1 we assume more words constitute topics. In this case, it's better to let the model find the proper settings.

In [3]:
# Fit an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=6,
                     alpha='symmetric', eta='auto', passes=5, random_state=1)

auto_alpha = lda_model.alpha
auto_eta = lda_model.eta

print(f"Automatically determined alpha: {auto_alpha}")
print(f"Automatically determined eta: {auto_eta}")

Automatically determined alpha: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Automatically determined eta: [0.21022789 0.15919016 0.15474923 ... 0.20106427 0.22866459 0.15752552]


Coherence is a measure of topics quality returned by the model and can be used for comparison

In [10]:
coherence_model_lda = CoherenceModel(model=lda_model, dictionary = id2word, texts=tokenized_docs, coherence='c_v')
coherence_model_lda.get_coherence()

0.46525658197159864

In [51]:
# Visualize the LDA model using pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word, mds='tsne')
pyLDAvis.display(vis)

In [52]:
lda_model[corpus]

# it returns a list of lists, where each inner list contains tuples where the topic ID and the topic probability are contained
# Each inner list is associated to a specific document

# output example
''' [
  DOC1: [(0, 0.05), (1, 0.09), ...]
  DOC2: [(0, 0.1), ...]
  .
  .
  .  
]
'''

' [\n  DOC1: [(0, 0.05), (1, 0.09), ...]\n  DOC2: [(0, 0.1), ...]\n  .\n  .\n  .  \n]\n'

In [53]:
# this function returns the top words for each topic by highest frequency
lda_model.show_topic(0, topn=10)

[('freeman', 0.021430584),
 ('gateway', 0.01636593),
 ('comet', 0.010580119),
 ('loser', 0.00937404),
 ('willie', 0.0079039885),
 ('beam', 0.0059753554),
 ('polytechnic', 0.0056695975),
 ('worcester', 0.005591045),
 ('jockey', 0.0054195723),
 ('gene', 0.005339657)]

In [54]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df['Clean_Content']):
    # Init output
    sent_topics_df = pd.DataFrame(columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        # we want to sort the tuples in the list 'row' by descending order of the second element, which is the topic probability
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic, i.e. tuple with the highest topic probability
                wp = ldamodel.show_topic(topic_num, topn = 15)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df.loc[len(sent_topics_df)] = [int(topic_num), round(prop_topic, 4), topic_keywords]
            else:
                break
    # Add original text to the end of the output
    sent_topics_df = pd.concat([sent_topics_df, texts], axis=1)
    return sent_topics_df

df_topic_sents_keywords = format_topics_sentences()


In [55]:
df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Clean_Content
0,4,0.4831,"government, state, work, world, distribution, ...",subject atheist summary music relate atheism a...
1,4,0.6427,"government, state, work, world, distribution, ...",subject introduction atheism summary file post...
2,4,0.9072,"government, state, work, world, distribution, ...",subject gospel date organization technical qui...
3,4,0.6249,"government, state, work, world, distribution, ...",subject separation organization mantis become ...
4,1,0.4829,"god, church, men, faith, human, gay, evidence,...",strom subject al ax match fund boy distributio...
...,...,...,...,...
11309,1,0.6896,"god, church, men, faith, human, gay, evidence,...",subject promise organization buffalo ye learn ...
11310,1,0.4565,"god, church, men, faith, human, gay, evidence,...",subject promise organization tourist bureau ba...
11311,1,0.8041,"god, church, men, faith, human, gay, evidence,...",subject protestant organization tourist bureau...
11312,4,0.8557,"government, state, work, world, distribution, ...",cutter subject backing tape distribution world...


In [56]:
df_topic_sents_keywords['Dominant_Topic'].unique()

array([4, 1, 2, 3, 5, 0], dtype=int64)

In [57]:
df_topic_distribution = df_topic_sents_keywords.groupby(['Dominant_Topic', 'Topic_Keywords']).size().sort_values(ascending=False).reset_index(name='count')

df_topic_distribution

Unnamed: 0,Dominant_Topic,Topic_Keywords,count
0,4,"government, state, work, world, distribution, ...",9316
1,3,"card, drive, color, monitor, window, video, sa...",838
2,1,"god, church, men, faith, human, gay, evidence,...",687
3,2,"team, hockey, game, league, season, player, pl...",437
4,5,"bike, honda, verdict, slick, propulsion, jet, ...",22
5,0,"freeman, gateway, comet, loser, willie, beam, ...",14
