# Implementation of  *Latent Dirichlet Allocation* algorithm with Gensim and Mallet
---


### Topic modelling
Topic Modeling is a process to automatically identify topics present in a text object and to derive hidden patterns exhibited by a text corpus. Topic Models are very useful for multiple purposes, including:

- Document clustering
- Organizing large blocks of textual data
- Information retrieval from unstructured text
- Feature selection

Topic Modelling is different from rule-based text mining approaches that use regular expressions or dictionary based keyword searching techniques. It is an unsupervised approach used for finding and observing the bunch of words (called “topics”) in large clusters of texts.

### LDA - Latent Dirichlet Allocation
There are many approaches for obtaining topics from a text such as – Term Frequency and Inverse Document Frequency. NonNegative Matrix Factorization techniques. Latent Dirichlet Allocation is the most popular topic modeling technique.

In [1]:
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.test.utils 
import spacy
import en_core_web_sm
import nltk 
#nltk.download('stopwords') - if needed
# from nltk.corpus import stopwords
from gensim.models.coherencemodel import CoherenceModel


from gensim.parsing.preprocessing import STOPWORDS
import tqdm
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
import matplotlib.pyplot as plt

import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'C:/new_mallet/mallet-2.0.8/'}) 
#You should update this path as per the path of Mallet directory on your system.
mallet_path = 'C:\\new_mallet\\mallet-2.0.8\\bin\\mallet' 
#You should update this path as per the path of Mallet directory on your system.

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Datasets/mann_ki_baat.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,Speech_date,Speech
0,"03 Oct, 2014","My Dear Countrymen,Today is the holy festival ..."
1,"02 Nov, 2014","My dear fellow countrymen, I am with you again..."
2,"14 Dec, 2014","My Dear Fellow Countrymen,Today I have this gr..."
3,"27 Jan, 2015","Today, Shri Barack Obama, President of the Uni..."
4,"22 Feb, 2015","Hello, my young friends. Today probably the en..."
...,...,...
68,"29 Nov, 2020","My dear countrymen,Namaskar! I want to share a..."
69,"27 Dec, 2020","My dear countrymen,Namaskar. Today is the 27th..."
70,"31 Jan, 2021","My dear countrymen,Namaskar. When I express Ma..."
71,"28 Feb, 2021","My dear countrymen,Namaskar. Yesterday was the..."


In [3]:
def remove_stopwords(texts):
    return[[word for word in simple_preprocess(str(doc)) if word not in all_stopwords_gensim]
            for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]#Turn words into lemmas

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

bigram = gensim.models.Phrases(df['Speech'], min_count = 5, threshold = 100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
all_stopwords_gensim = STOPWORDS.union(set(['time', 'new', 'mr.', 'mrs.', 'ki', 'th', 'today', 'way', 'am', 'address', 'own',
                                            'true', 'mann', 'baat', 'people', 'countryman', 'hindi', 'year', 'new', 'day',
                                            'life', 'friend', 'friends', 'month', 'world', 'many', 'dr.', 'pm', 'modi', 'ji', 
                                            'pe', 'cook', 'brinjal', 'st', 'nd', 'rd', 'sir', 'dear', 'morning', 'sunday',
                                            'version', 'speech', 'sundays', 'authoritative', 'rendering', 'english', 'original',
                                            'bag', 'suggestion', 'next', 'step', 'numerous', 'dear', 'fellow','government', 
                                            'india', 'country', 'bharat', 'indian', 'message', 'event', 'nation', 'national',
                                            'january', 'december', 'march', 'june', 'rt', 'ak', 'usman', 'usman-', 'kaniga-', 
                                            'decision', 'scheme', 'self', 'thing', 'region', 'reliant', 'countrymen','greetings',
                                            'like', 'share', 'ii', 'prime', 'minister', 'thing', 'lot', 'thought', 'issue',
                                            'great', 'good', 'big', 'day', 'year', 'entire', 'place']))

In [4]:
data_words_nostops = remove_stopwords(df['Speech'])
data_words_bigram = make_bigrams(data_words_nostops)
nlp = en_core_web_sm.load(disable = ['parser', 'ner'])
data_lemma = lemmatization(data_words_bigram, allowed_postags=['NOUN', 'ADJ'])

id2word = corpora.Dictionary(data_lemma)
#corpus
texts = data_lemma
#term document matrix
corpus = [id2word.doc2bow(text) for text in texts]

## Gensim - LDA

In [8]:
coherence = []
for k in range(2,25):
#     print('Round: '+str(k))
    Lda = gensim.models.ldamodel.LdaModel
    sample_ldamodel = Lda(corpus, num_topics=k, id2word = id2word, passes=5, iterations=8, chunksize = 2, random_state=50)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel, texts=data_lemma, dictionary=id2word, coherence='c_v')
    pp = sample_ldamodel.log_perplexity(corpus, total_docs=73)
    coherence.append((k,cm.get_coherence(),pp))
    
    
x = [x[0] for x in coherence]
y1 = [x[1] for x in coherence]
y2 = [x[2] for x in coherence]

import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=1, subplot_titles=("Coherence score", "Perplexity"), shared_xaxes=True,
                    vertical_spacing=0.02)
fig.add_trace(go.Scatter(name="Coherence score", x=x, y=y1, mode='lines+markers'), row=1, col=1)
fig.add_trace(go.Scatter(name="Perplexity", x=x, y=y2, mode='lines+markers'), row=2, col=1)
fig.update_layout(height=600, width=1000, title_text="Gensim LDA - Number of Topics vs. Coherence and Perplexity", xaxis1 = dict(
        tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
fig.write_html("Images/Gensim_topics1.html")
fig.show()

In [9]:
coherence = []
for k in range(2,25):
#     print('Round: '+str(k))
    Lda = gensim.models.ldamodel.LdaModel
    sample_ldamodel = Lda(corpus, num_topics=k, id2word = id2word, passes=5, iterations=10, chunksize = 2, random_state=50)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel, texts=data_lemma, dictionary=id2word, coherence='c_v')
    pp = sample_ldamodel.log_perplexity(corpus, total_docs=73)
    coherence.append((k,cm.get_coherence(),pp))
    
x = [x[0] for x in coherence]
y1 = [x[1] for x in coherence]
y2 = [x[2] for x in coherence]

fig = make_subplots( rows=2, cols=1, subplot_titles=("Coherence score", "Perplexity"), shared_xaxes=True,
                    vertical_spacing=0.02)
fig.add_trace(go.Scatter(name="Coherence score", x=x, y=y1, mode='lines+markers'), row=1, col=1)
fig.add_trace(go.Scatter(name="Perplexity", x=x, y=y2, mode='lines+markers'), row=2, col=1)
fig.update_layout(height=600, width=1000, title_text="Gensim LDA - Number of Topics vs. Coherence and Perplexity", xaxis1 = dict(
        tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
fig.write_html("Images/Gensim_topics2.html")
fig.show()

In [11]:
lda_model1 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = 6,
                                        random_state=50,
                                       chunksize = 5,
                                       passes = 10,
                                       iterations = 10,
                                       minimum_probability = 0)
lda_model1.print_topics()

[(0,
  '0.005*"young" + 0.005*"game" + 0.004*"water" + 0.004*"year" + 0.004*"youth" + 0.004*"sport" + 0.004*"society" + 0.003*"teacher" + 0.003*"social" + 0.003*"festival"'),
 (1,
  '0.007*"farmer" + 0.007*"exam" + 0.006*"child" + 0.006*"family" + 0.005*"day" + 0.004*"water" + 0.004*"thing" + 0.004*"important" + 0.003*"student" + 0.003*"young"'),
 (2,
  '0.008*"woman" + 0.007*"festival" + 0.006*"society" + 0.005*"farmer" + 0.004*"effort" + 0.004*"light" + 0.004*"village" + 0.004*"day" + 0.004*"year" + 0.004*"guru"'),
 (3,
  '0.010*"water" + 0.005*"yoga" + 0.005*"village" + 0.005*"year" + 0.004*"farmer" + 0.004*"corona" + 0.004*"work" + 0.004*"family" + 0.004*"small" + 0.003*"youth"'),
 (4,
  '0.005*"day" + 0.005*"corona" + 0.004*"village" + 0.004*"question" + 0.004*"effort" + 0.004*"young" + 0.004*"family" + 0.003*"barack" + 0.003*"work" + 0.003*"time"'),
 (5,
  '0.005*"family" + 0.005*"festival" + 0.005*"year" + 0.005*"story" + 0.004*"yoga" + 0.004*"village" + 0.004*"young" + 0.004*"f

In [12]:
pyLDAvis.enable_notebook()
LDAvis1 = gensimvis.prepare(lda_model1, corpus, id2word)
pyLDAvis.save_html(LDAvis1, 'HTML/gensim_lda1_viz.html')
LDAvis1

In [13]:
print('\nPerplexity for k = 7 : ', lda_model1.log_perplexity(corpus,total_docs=74))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda1 = CoherenceModel(model=lda_model1, texts=data_lemma, dictionary=id2word , coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('\nCoherence Score for k = 7 : ', coherence_lda1)


Perplexity for k = 7 :  -8.180821366147304

Coherence Score for k = 7 :  0.24035389470860435


In [14]:
lda_model2 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = 9,
                                       random_state=50,
                                       chunksize = 2,
                                       passes = 5,
                                       iterations=10,
                                       per_word_topics=True,
                                       minimum_probability = 0)
lda_model2.print_topics()

[(0,
  '0.011*"cleanliness" + 0.008*"khadi" + 0.005*"child" + 0.005*"october" + 0.005*"year" + 0.005*"family" + 0.004*"day" + 0.004*"opportunity" + 0.004*"young" + 0.004*"power"'),
 (1,
  '0.008*"farmer" + 0.006*"festival" + 0.005*"year" + 0.005*"student" + 0.005*"child" + 0.004*"effort" + 0.004*"village" + 0.004*"young" + 0.004*"house" + 0.004*"time"'),
 (2,
  '0.001*"exam" + 0.001*"child" + 0.001*"opportunity" + 0.001*"farmer" + 0.001*"student" + 0.001*"thing" + 0.001*"year" + 0.001*"work" + 0.001*"festival" + 0.001*"village"'),
 (3,
  '0.005*"work" + 0.004*"lakh" + 0.004*"farmer" + 0.003*"opportunity" + 0.003*"effort" + 0.003*"tree" + 0.003*"day" + 0.003*"family" + 0.003*"year" + 0.003*"technology"'),
 (4,
  '0.008*"farmer" + 0.008*"brother" + 0.006*"barack" + 0.006*"question" + 0.006*"law" + 0.005*"village" + 0.005*"land" + 0.005*"sister" + 0.005*"child" + 0.004*"work"'),
 (5,
  '0.008*"village" + 0.008*"water" + 0.007*"yoga" + 0.006*"poor" + 0.005*"farmer" + 0.005*"year" + 0.004*"

In [15]:
pyLDAvis.enable_notebook()
LDAvis2 = gensimvis.prepare(lda_model2, corpus, id2word)
pyLDAvis.save_html(LDAvis2, 'HTML/gensim_lda2_viz.html')
LDAvis2

In [16]:
print('\nPerplexity for k = 9 : ', lda_model2.log_perplexity(corpus,total_docs=74))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=data_lemma, dictionary=id2word , coherence='c_v')
coherence_lda2 = coherence_model_lda2.get_coherence()
print('\nCoherence Score for k = 9 : ', coherence_lda2)


Perplexity for k = 9 :  -8.30072288706841

Coherence Score for k = 9 :  0.27739725453034025


## Mallet - LDA

In [20]:
mallet_coherence = []
for k in range(2,25):
    print('Round: '+str(k))
    sample_ldamodel_mallet = gensim.models.wrappers.LdaMallet(
       mallet_path, corpus=corpus, id2word=id2word)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=sample_ldamodel_mallet, texts=data_lemma, dictionary=id2word, coherence='c_v')
    mallet_coherence.append((k,cm.get_coherence()))
    
    
x_val = [x[0] for x in mallet_coherence]
y_val = [x[1] for x in mallet_coherence]

Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24


In [138]:
fig = make_subplots( rows=1, cols=1, subplot_titles=("Coherence score"))
fig.add_trace(go.Scatter(name="Coherence score", x=x_val, y=y_val, mode='lines+markers'), row=1, col=1)
fig.update_layout(height=600, width=1000, title_text="Mallet LDA - Number of Topics vs. Coherence", xaxis1 = dict(
        tickmode = 'linear', tick0 = 1, dtick = 1), xaxis2 = dict(tickmode = 'linear', tick0 = 1, dtick = 1))
fig.write_html("Images/Mallet_topics.html")
fig.show()

In [24]:
lda_mallet = gensim.models.wrappers.LdaMallet(
    mallet_path, corpus = corpus, num_topics = 7, id2word = id2word)
pprint(lda_mallet.show_topics(formatted=False))

[(0,
  [('poor', 0.01511268228015908),
   ('village', 0.012284577993813522),
   ('work', 0.011842686699072028),
   ('small', 0.011135660627485638),
   ('crore', 0.010870525850640743),
   ('campaign', 0.01042863455589925),
   ('lakh', 0.009544851966416261),
   ('money', 0.009279717189571365),
   ('state', 0.009102960671674768),
   ('day', 0.008484312859036678)]),
 (1,
  [('water', 0.035202322627472325),
   ('yoga', 0.020322990382870623),
   ('exam', 0.01651242968608238),
   ('child', 0.010524405733986572),
   ('student', 0.00998003992015968),
   ('thing', 0.009889312284521865),
   ('mind', 0.00943567410633279),
   ('experience', 0.008891308292505897),
   ('success', 0.008619125385592452),
   ('family', 0.008619125385592452)]),
 (2,
  [('farmer', 0.0427262313860252),
   ('village', 0.022909507445589918),
   ('sister', 0.013974799541809852),
   ('land', 0.0138602520045819),
   ('brother', 0.012829324169530355),
   ('field', 0.01122565864833906),
   ('work', 0.010194730813287515),
   ('law

In [25]:
mallet_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

pyLDAvis.enable_notebook()
mallet_LDAvis = gensimvis.prepare(mallet_lda_model, corpus, id2word)
pyLDAvis.save_html(mallet_LDAvis, 'HTML/mallet_lda.html')
mallet_LDAvis

In [26]:
print(mallet_LDAvis.topic_order)

[5, 6, 1, 2, 7, 4, 3]


In [91]:
topic_list = {
    '2' : 'Youth',
    '3' : 'Economics',
    '5' : 'Sports & Culture',
    '6' : 'Health',
    '7' : 'Agriculture & Development'
}

In [59]:
topics_docs = list()
for m in mallet_lda_model[corpus]:
    topics_docs.append(m)

topics_docs_dict = dict()
for i in range(len(df)):
    topics_docs_dict[df.loc[i]["Speech_date"]] = [doc for (topic, doc) in topics_docs[i]]

topics_docs_df = pd.DataFrame(data=topics_docs_dict)


In [101]:
topics_docs_df = topics_docs_df.rename(index={1 : 'Youth',
    2 : 'Economics',
    4 : 'Sports & Culture',
    5 : 'Health',
    6 : 'Agriculture & Development'})

topics_docs_df = topics_docs_df.drop([0,3])
topics_docs_df

Unnamed: 0,"03 Oct, 2014","02 Nov, 2014","14 Dec, 2014","27 Jan, 2015","22 Feb, 2015","22 Mar, 2015","31 May, 2015","28 Jun, 2015","26 Jul, 2015","30 Aug, 2015",...,"28 Jun, 2020","26 Jul, 2020","30 Aug, 2020","27 Sep, 2020","25 Oct, 2020","29 Nov, 2020","27 Dec, 2020","31 Jan, 2021","28 Feb, 2021","28 Mar, 2021"
Youth,0.354254,0.132445,0.162066,0.114928,0.705507,0.091072,0.339607,0.201228,0.122877,0.070983,...,0.136387,0.097952,0.073718,0.041276,0.054828,0.058134,0.042764,0.090704,0.342614,0.058511
Economics,0.059421,0.07834,0.024663,0.030977,0.0479,0.674032,0.180472,0.055952,0.216888,0.274004,...,0.087237,0.077116,0.138549,0.184946,0.316077,0.500546,0.083693,0.257736,0.127786,0.422243
Sports & Culture,0.187285,0.097504,0.063032,0.08032,0.032122,0.016527,0.078049,0.077849,0.065473,0.08355,...,0.104356,0.104015,0.069828,0.1031,0.232979,0.093027,0.098966,0.152429,0.076761,0.125492
Health,0.141324,0.360338,0.540309,0.624644,0.090234,0.04268,0.23006,0.174801,0.155198,0.138296,...,0.065446,0.084575,0.041306,0.094758,0.08235,0.086765,0.182988,0.10344,0.102873,0.125668
Agriculture & Development,0.041553,0.054868,0.055848,0.029622,0.047161,0.022654,0.043607,0.060026,0.045738,0.046929,...,0.184152,0.107968,0.453451,0.10525,0.073434,0.07481,0.271332,0.057317,0.134494,0.116126


In [102]:
docs_topics_df = topics_docs_df.transpose()
docs_topics_df

Unnamed: 0,Youth,Economics,Sports & Culture,Health,Agriculture & Development
"03 Oct, 2014",0.354254,0.059421,0.187285,0.141324,0.041553
"02 Nov, 2014",0.132445,0.078340,0.097504,0.360338,0.054868
"14 Dec, 2014",0.162066,0.024663,0.063032,0.540309,0.055848
"27 Jan, 2015",0.114928,0.030977,0.080320,0.624644,0.029622
"22 Feb, 2015",0.705507,0.047900,0.032122,0.090234,0.047161
...,...,...,...,...,...
"29 Nov, 2020",0.058134,0.500546,0.093027,0.086765,0.074810
"27 Dec, 2020",0.042764,0.083693,0.098966,0.182988,0.271332
"31 Jan, 2021",0.090704,0.257736,0.152429,0.103440,0.057317
"28 Feb, 2021",0.342614,0.127786,0.076761,0.102873,0.134494


In [103]:
def get_dominant_topic(df, col) :
    return [df[col].idxmax(), df[col].max()]

In [104]:
topics = []
for date in list(topics_docs_df.columns):
    topic = get_dominant_topic(topics_docs_df, date)
    topics.append(topic)
    
topics

[['Youth', 0.3542541341690118],
 ['Health', 0.36033813405921955],
 ['Health', 0.5403085672932568],
 ['Health', 0.6246444492483673],
 ['Youth', 0.7055070170721602],
 ['Economics', 0.674032011190757],
 ['Youth', 0.3396068775303148],
 ['Youth', 0.20122768619251538],
 ['Economics', 0.21688833929019316],
 ['Economics', 0.2740044936720676],
 ['Sports & Culture', 0.3270500859094061],
 ['Health', 0.44715594112513957],
 ['Health', 0.2068602910233063],
 ['Health', 0.19531876722459846],
 ['Health', 0.2248132930415512],
 ['Youth', 0.7403249229164351],
 ['Agriculture & Development', 0.27920122110643764],
 ['Youth', 0.29636167916289624],
 ['Youth', 0.3649281092226516],
 ['Youth', 0.390464344094767],
 ['Sports & Culture', 0.13539535041154815],
 ['Agriculture & Development', 0.3649552230127056],
 ['Sports & Culture', 0.3865518789945233],
 ['Sports & Culture', 0.4771967884414365],
 ['Youth', 0.11313752063150313],
 ['Agriculture & Development', 0.11047480735323756],
 ['Youth', 0.5970968514084787],
 ['Ec

In [152]:
# list(topics_docs_df.columns)
monthly_topics = pd.DataFrame()
monthly_topics['Dates'] = list(topics_docs_df.columns)
monthly_topics['Topic'] = topics
monthly_topics.to_csv('monthly_topics.csv')

In [137]:
import dataframe_image as dfi

styled_docs_topics_df = docs_topics_df.style.background_gradient()
styled_docs_topics_df
dfi.export(styled_docs_topics_df, 'Images/docs_topics_monthly.png')

In [144]:
years = []
for i in range(len(docs_topics_df)) :
    years.append(docs_topics_df.index[i][-4:])
    
docs_topics_df['Years'] = years
grouped_docs_topics_df = docs_topics_df.groupby(['Years']).mean()
styled_grouped_docs_topics_df = grouped_docs_topics_df.style.background_gradient()
dfi.export(styled_grouped_docs_topics_df, 'Images/docs_topics_yearly.png')
styled_grouped_docs_topics_df

Unnamed: 0_level_0,Youth,Economics,Sports & Culture,Health,Agriculture & Development
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,0.216255,0.054142,0.11594,0.347324,0.050757
2015,0.181398,0.165705,0.099454,0.23122,0.04433
2016,0.222154,0.07644,0.137429,0.11745,0.111553
2017,0.170859,0.061502,0.249478,0.141073,0.139577
2018,0.093335,0.132226,0.232885,0.152117,0.216483
2019,0.140934,0.088616,0.229725,0.226141,0.162747
2020,0.091588,0.171056,0.102937,0.089174,0.154333
2021,0.163943,0.269255,0.118227,0.11066,0.102646


In [142]:
import plotly.express as px 

fig = px.line(grouped_docs_topics_df, x=grouped_docs_topics_df.index, y=grouped_docs_topics_df.columns[0:])
fig.update_traces(mode='markers+lines')
fig.update_layout(title="Variation of topics over the years", yaxis_title="Count", legend_title="Mallet LDA Identified Topics")
fig.write_html("Images/Mallet_all_topics.html")
fig.show()