# Implementation of  *Latent Dirichlet Allocation* algorithm with Gensim and Mallet
---

In [1]:
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.test.utils 
import spacy
import en_core_web_sm
import nltk 
#nltk.download('stopwords') - if needed
# from nltk.corpus import stopwords

from gensim.parsing.preprocessing import STOPWORDS
import tqdm
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
import matplotlib.pyplot as plt

import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'C:/new_mallet/mallet-2.0.8/'}) 
#You should update this path as per the path of Mallet directory on your system.
mallet_path = 'C:\\new_mallet\\mallet-2.0.8\\bin\\mallet' 
#You should update this path as per the path of Mallet directory on your system.

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Datasets/mann_ki_baat.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,Speech_date,Speech
0,"03 Oct, 2014","My Dear Countrymen,Today is the holy festival ..."
1,"02 Nov, 2014","My dear fellow countrymen, I am with you again..."
2,"14 Dec, 2014","My Dear Fellow Countrymen,Today I have this gr..."
3,"27 Jan, 2015","Today, Shri Barack Obama, President of the Uni..."
4,"22 Feb, 2015","Hello, my young friends. Today probably the en..."
...,...,...
68,"29 Nov, 2020","My dear countrymen,Namaskar! I want to share a..."
69,"27 Dec, 2020","My dear countrymen,Namaskar. Today is the 27th..."
70,"31 Jan, 2021","My dear countrymen,Namaskar. When I express Ma..."
71,"28 Feb, 2021","My dear countrymen,Namaskar. Yesterday was the..."


In [3]:
def remove_stopwords(texts):
    return[[word for word in simple_preprocess(str(doc)) if word not in all_stopwords_gensim]
            for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]#Turn words into lemmas

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [77]:
bigram = gensim.models.Phrases(df['Speech'], min_count = 5, threshold = 100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

all_stopwords_gensim = STOPWORDS.union(set(['time', 'new', 'mr.', 'mrs.', 'ki', 'th', 'today', 'way', 'am', 'address', 'own',
                                            'true', 'mann', 'baat', 'people', 'countryman', 'hindi', 'year', 'new', 'day',
                                            'life', 'friend', 'friends', 'month', 'world', 'many', 'dr.', 'pm', 'modi', 'ji', 
                                            'pe', 'cook', 'brinjal', 'st', 'nd', 'rd', 'sir', 'dear', 'morning', 'sunday',
                                            'version', 'speech', 'sundays', 'authoritative', 'rendering', 'english', 'original',
                                            'bag', 'suggestion', 'next', 'step', 'numerous', 'dear', 'fellow','government', 
                                            'india', 'country', 'bharat', 'indian', 'message', 'event', 'nation', 'national',
                                            'january', 'december', 'march', 'june', 'rt', 'ak', 'usman', 'usman-', 'kaniga-', 
                                            'decision', 'scheme', 'self', 'thing', 'region', 'reliant', 'countrymen','greetings',
                                            'like', 'share', 'ii', 'prime', 'minister', 'thing', 'lot', 'thought', 'issue',
                                            'great', 'good', 'big', 'day', 'year', 'entire', 'place']))

In [78]:
data_words_nostops = remove_stopwords(df['Speech'])

data_words_bigram = make_bigrams(data_words_nostops)

nlp = en_core_web_sm.load(disable = ['parser', 'ner'])
data_lemma = lemmatization(data_words_bigram, allowed_postags=['NOUN', 'ADJ'])

In [79]:
id2word = corpora.Dictionary(data_lemma)
#corpus
texts = data_lemma
#term document matrix
corpus = [id2word.doc2bow(text) for text in texts]

In [119]:
# coherence = []
# for k in range(2,25):
#     print('Round: '+str(k))
#     Lda = gensim.models.ldamodel.LdaModel
#     sample_ldamodel = Lda(corpus, num_topics=k, id2word = id2word, passes=10, iterations=20, chunksize = 2)
    
#     cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel, texts=data_lemma, dictionary=id2word, coherence='c_v')
#     pp = sample_ldamodel.log_perplexity(corpus, total_docs=73)
#     coherence.append((k,cm.get_coherence(),pp))
    

# # model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemma, start=2, limit=16, step=1)

# x = [x[0] for x in coherence]
# y1 = [x[1] for x in coherence]
# y2 = [x[2] for x in coherence]

# # plt.figure(figsize=(12, 10))
# # plt.plot(x, y1)
# # plt.plot(x, y2)
# # plt.scatter(x, y1)
# # plt.scatter(x, y2)
# # plt.title('Gensim LDA - Number of Topics vs. Coherence')
# # plt.xlabel("Num Topics")
# # plt.ylabel("Coherence score")
# # plt.savefig('Images/gensim_topics.png')
# # plt.show()

In [120]:
# # fig, (ax1, ax2) = plt.subplots(2)
# # fig.suptitle('Axes values are scaled individually by default')
# # # fig.set_title('Gensim LDA - Number of Topics vs. Coherence')
# # ax1.plot(x, y1)
# # # ax1.xlabel("Num Topics")
# # # ax1.ylabel("Coherence score")
# # ax2.plot(x, y2)
# # # ax2.xlabel("Num Topics")
# # # ax2.ylabel("Perplexity")
# import plotly
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go

# fig = make_subplots( rows=2, cols=1, subplot_titles=("Coherence score", "Perplexity"), shared_xaxes=True,
#                     vertical_spacing=0.02)

# fig.add_trace(go.Scatter(name="Coherence score", x=x, y=y1, mode='lines+markers'), row=1, col=1)

# fig.add_trace(go.Scatter(name="Perplexity", x=x, y=y2, mode='lines+markers'), row=2, col=1)

# fig.update_layout(height=600, width=1000, title_text="Gensim LDA - Number of Topics vs. Coherence and Perplexity", xaxis1 = dict(
#         tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
# fig.write_html("Images/Gensim_optimal.html")
# fig.show()

In [67]:
coherence = []
for k in range(2,25):
    print('Round: '+str(k))
    Lda = gensim.models.ldamodel.LdaModel
    sample_ldamodel = Lda(corpus, num_topics=k, id2word = id2word, passes=5, iterations=8, chunksize = 2)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel, texts=data_lemma, dictionary=id2word, coherence='c_v')
    pp = sample_ldamodel.log_perplexity(corpus, total_docs=73)
    coherence.append((k,cm.get_coherence(),pp))
    

# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemma, start=2, limit=16, step=1)

x = [x[0] for x in coherence]
y1 = [x[1] for x in coherence]
y2 = [x[2] for x in coherence]

fig = make_subplots( rows=2, cols=1, subplot_titles=("Coherence score", "Perplexity"), shared_xaxes=True,
                    vertical_spacing=0.02)

fig.add_trace(go.Scatter(name="Coherence score", x=x, y=y1, mode='lines+markers'), row=1, col=1)

fig.add_trace(go.Scatter(name="Perplexity", x=x, y=y2, mode='lines+markers'), row=2, col=1)

fig.update_layout(height=600, width=1000, title_text="Gensim LDA - Number of Topics vs. Coherence and Perplexity", xaxis1 = dict(
        tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
fig.write_html("Images/Gensim_optimal_new.html")
fig.show()

Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24


In [70]:
coherence = []
for k in range(2,25):
    print('Round: '+str(k))
    Lda = gensim.models.ldamodel.LdaModel
    sample_ldamodel = Lda(corpus, num_topics=k, id2word = id2word, passes=5, iterations=10, chunksize = 2)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel, texts=data_lemma, dictionary=id2word, coherence='c_v')
    pp = sample_ldamodel.log_perplexity(corpus, total_docs=73)
    coherence.append((k,cm.get_coherence(),pp))
    

# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemma, start=2, limit=16, step=1)

x = [x[0] for x in coherence]
y1 = [x[1] for x in coherence]
y2 = [x[2] for x in coherence]

fig = make_subplots( rows=2, cols=1, subplot_titles=("Coherence score", "Perplexity"), shared_xaxes=True,
                    vertical_spacing=0.02)

fig.add_trace(go.Scatter(name="Coherence score", x=x, y=y1, mode='lines+markers'), row=1, col=1)

fig.add_trace(go.Scatter(name="Perplexity", x=x, y=y2, mode='lines+markers'), row=2, col=1)

fig.update_layout(height=600, width=1000, title_text="Gensim LDA - Number of Topics vs. Coherence and Perplexity", xaxis1 = dict(
        tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
fig.write_html("Images/Gensim_optimal_new.html")
fig.show()

Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24


In [114]:
mallet_coherence = []
for k in range(2,25):
    print('Round: '+str(k))
    sample_ldamodel_mallet = gensim.models.wrappers.LdaMallet(
       mallet_path, corpus=corpus, id2word=id2word)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel_mallet, texts=data_lemma, dictionary=id2word, coherence='c_v')
#     pp = sample_ldamodel_mallet.log_perplexity(corpus, total_docs=73)
    mallet_coherence.append((k,cm.get_coherence(), pp))
    
    
x_val = [x[0] for x in mallet_coherence]
y_val1 = [x[1] for x in mallet_coherence]
# y_val2 = [x[2] for x in mallet_coherence]

fig = make_subplots( rows=1, cols=1, subplot_titles=("Coherence score"), shared_xaxes=True,
                    vertical_spacing=0.02)

fig.add_trace(go.Scatter(name="Coherence score", x=x_val, y=y_val1, mode='lines+markers'), row=1, col=1)

# fig.add_trace(go.Scatter(name="Perplexity", x=x_val, y=y_val2, mode='lines+markers'), row=2, col=1)

fig.update_layout(height=600, width=1000, title_text="Mallet LDA - Number of Topics vs. Coherence and Perplexity", xaxis1 = dict(
        tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
fig.write_html("Images/Gensim_optimal.html")
fig.show()

Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24


# Gensim LDA Model
---

In [110]:
lda_model1 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = 7,
                                       chunksize = 5,
                                       passes = 10,
                                       iterations = 10,
                                       minimum_probability = 0)
lda_model1.print_topics()

[(0,
  '0.016*"water" + 0.008*"yoga" + 0.008*"story" + 0.007*"farmer" + 0.005*"village" + 0.005*"family" + 0.004*"year" + 0.004*"youth" + 0.003*"opportunity" + 0.003*"river"'),
 (1,
  '0.005*"year" + 0.005*"young" + 0.005*"day" + 0.005*"festival" + 0.004*"farmer" + 0.004*"poor" + 0.004*"work" + 0.004*"woman" + 0.004*"technology" + 0.003*"village"'),
 (2,
  '0.006*"festival" + 0.005*"woman" + 0.005*"society" + 0.005*"day" + 0.004*"child" + 0.004*"young" + 0.004*"cleanliness" + 0.004*"game" + 0.004*"effort" + 0.004*"water"'),
 (3,
  '0.010*"exam" + 0.006*"student" + 0.005*"thing" + 0.005*"family" + 0.005*"young" + 0.004*"mind" + 0.004*"child" + 0.004*"day" + 0.004*"experience" + 0.003*"opportunity"'),
 (4,
  '0.007*"yoga" + 0.007*"corona" + 0.005*"family" + 0.005*"service" + 0.005*"ayushman" + 0.004*"treatment" + 0.004*"water" + 0.004*"child" + 0.004*"day" + 0.003*"heart"'),
 (5,
  '0.006*"village" + 0.006*"family" + 0.005*"water" + 0.005*"poor" + 0.004*"day" + 0.004*"money" + 0.004*"sta

In [111]:
pyLDAvis.enable_notebook()
LDAvis1 = gensimvis.prepare(lda_model1, corpus, id2word)
pyLDAvis.save_html(LDAvis1, 'HTML/gensim_lda1_viz.html')
LDAvis1

In [112]:
print('\nPerplexity for k = 5 : ', lda_model1.log_perplexity(corpus,total_docs=74))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda1 = CoherenceModel(model=lda_model1, texts=data_lemma, dictionary=id2word , coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('\nCoherence Score for k = 5 : ', coherence_lda1)


Perplexity for k = 5 :  -8.19122197069738

Coherence Score for k = 5 :  0.252073241089343


In [83]:
lda_model2 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = 9,
                                       chunksize = 2,
                                       passes = 5,
                                       iterations=10,
                                       per_word_topics=True,
                                       minimum_probability = 0)
lda_model2.print_topics()

[(0,
  '0.005*"farmer" + 0.003*"law" + 0.003*"land" + 0.003*"village" + 0.002*"water" + 0.002*"sister" + 0.002*"yoga" + 0.002*"brother" + 0.001*"state" + 0.001*"compensation"'),
 (1,
  '0.003*"suggestion" + 0.003*"brother" + 0.003*"strength" + 0.003*"thought" + 0.003*"khaadi" + 0.003*"sheep" + 0.003*"vijay" + 0.002*"lion" + 0.002*"dashami" + 0.002*"cub"'),
 (2,
  '0.012*"yoga" + 0.006*"question" + 0.006*"barack" + 0.005*"gold" + 0.005*"water" + 0.005*"village" + 0.004*"work" + 0.004*"thing" + 0.004*"child" + 0.004*"state"'),
 (3,
  '0.019*"farmer" + 0.008*"village" + 0.008*"land" + 0.007*"brother" + 0.006*"law" + 0.006*"sister" + 0.006*"opportunity" + 0.005*"work" + 0.004*"state" + 0.004*"year"'),
 (4,
  '0.005*"child" + 0.005*"drug" + 0.004*"day" + 0.004*"kid" + 0.004*"family" + 0.004*"concern" + 0.003*"addiction" + 0.003*"thing" + 0.003*"khadi" + 0.003*"tree"'),
 (5,
  '0.023*"exam" + 0.009*"water" + 0.008*"student" + 0.008*"thing" + 0.008*"question" + 0.008*"teacher" + 0.007*"child"

In [84]:
pyLDAvis.enable_notebook()
LDAvis2 = gensimvis.prepare(lda_model2, corpus, id2word)
pyLDAvis.save_html(LDAvis2, 'HTML/gensim_lda2_viz.html')
LDAvis2

In [85]:
print('\nPerplexity for k = 10 : ', lda_model2.log_perplexity(corpus,total_docs=74))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=data_lemma, dictionary=id2word , coherence='c_v')
coherence_lda2 = coherence_model_lda2.get_coherence()
print('\nCoherence Score for k = 10 : ', coherence_lda2)


Perplexity for k = 10 :  -8.340231178770352

Coherence Score for k = 10 :  0.29624089791383784


### Finding optimal no of topics

In [None]:
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

coherence = []
for k in range(2,16):
    print('Round: '+str(k))
    Lda = gensim.models.ldamodel.LdaModel
    sample_ldamodel = Lda(corpus, num_topics=k, id2word = id2word, passes=50,\
                   iterations=100, chunksize = 5)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel, texts=data_lemma,\
                                                     dictionary=id2word, coherence='c_v')
    coherence.append((k,cm.get_coherence()))

In [None]:
# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemma, start=2, limit=16, step=1)

# limit = 16
# start = 2
# step = 1
# x = range(start, limit, step)

x = [x[0] for x in coherence]
y = [x[1] for x in coherence]

plt.figure(figsize=(12, 6))
plt.plot(x, y)
plt.scatter(x, y)
plt.title('Gensim LDA - Number of Topics vs. Coherence')
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.savefig('Images/gensim_topics.png')
plt.show()

### Optimal topics model

In [None]:
optimal_lda_model = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = 9,
                                       chunksize = 5,
                                       passes = 50,
                                       iterations=100,
                                       per_word_topics=True,
                                       minimum_probability = 0)
optimal_lda_model.print_topics()

In [None]:
pyLDAvis.enable_notebook()
optimal_LDAvis = gensimvis.prepare(optimal_lda_model, corpus, id2word)
pyLDAvis.save_html(optimal_LDAvis, 'HTML/gensim_lda2_viz.html')
optimal_LDAvis

In [None]:
print('\nPerplexity: ', optimal_lda_model.log_perplexity(corpus, total_docs=74))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
optimal_coherence_model_lda = CoherenceModel(model=optimal_lda_model, texts=data_lemma, dictionary=id2word , coherence='c_v')
optimal_coherence_lda = optimal_coherence_model_lda.get_coherence()
print('\nCoherence Score: ', optimal_coherence_lda)

# Mallet LDA Model
---

In [117]:
lda_mallet = gensim.models.wrappers.LdaMallet(
    mallet_path, corpus = corpus, num_topics = 7, id2word = id2word)
pprint(lda_mallet.show_topics(formatted=False))

[(0,
  [('yoga', 0.015349194167306216),
   ('cleanliness', 0.012279355333844973),
   ('programme', 0.00990023023791251),
   ('youth', 0.009363008442056791),
   ('year', 0.008825786646201074),
   ('letter', 0.008058326937835763),
   ('experience', 0.00782808902532617),
   ('man', 0.00782808902532617),
   ('power', 0.007674597083653108),
   ('city', 0.007674597083653108)]),
 (1,
  [('farmer', 0.03706369197553076),
   ('village', 0.018591819599376273),
   ('land', 0.014513614009835672),
   ('story', 0.012714405661508936),
   ('year', 0.012114669545400024),
   ('effort', 0.011275038982847548),
   ('sister', 0.0109151973131822),
   ('field', 0.010555355643516853),
   ('brother', 0.00995561952740794),
   ('work', 0.00947583063452081)]),
 (2,
  [('woman', 0.01578728707935189),
   ('game', 0.013502285002077275),
   ('young', 0.009347735770668882),
   ('youth', 0.008724553385957623),
   ('society', 0.008309098462816784),
   ('school', 0.008101371001246365),
   ('fit', 0.006439551308683008),
   

In [118]:
mallet_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

pyLDAvis.enable_notebook()
mallet_LDAvis = gensimvis.prepare(mallet_lda_model, corpus, id2word)
pyLDAvis.save_html(mallet_LDAvis, 'HTML/mallet_lda1_viz.html')
mallet_LDAvis

### Performance

In [None]:
mallet_coherence = []
for k in range(2,16):
    print('Round: '+str(k))
    sample_ldamodel_mallet = gensim.models.wrappers.LdaMallet(
       mallet_path, corpus=corpus, id2word=id2word)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel_mallet, texts=data_lemma,\
                                                     dictionary=id2word, coherence='c_v')
    mallet_coherence.append((k,cm.get_coherence()))

In [None]:
x = [x[0] for x in mallet_coherence]
y = [x[1] for x in mallet_coherence]

plt.figure(figsize=(12, 6))
plt.plot(x, y)
plt.scatter(x, y)
plt.title('Number of Topics vs. Coherence')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence')
plt.xticks(x)
plt.savefig('Images/mallet_topics.png')
plt.show()

### Optimal topics model

In [None]:
optimal_lda_mallet = gensim.models.wrappers.LdaMallet(
    mallet_path, corpus=corpus, num_topics=5, id2word=id2word)
pprint(optimal_lda_mallet.show_topics(formatted=False))

In [None]:
optimal_mallet_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(optimal_lda_mallet)

pyLDAvis.enable_notebook()
optimal_mallet_LDAvis = gensimvis.prepare(optimal_mallet_lda_model, corpus, id2word)
pyLDAvis.save_html(optimal_mallet_LDAvis, 'HTML/mallet_lda2_viz.html')
optimal_mallet_LDAvis

In [None]:
print(optimal_mallet_LDAvis.topic_order)