# TOPIC MODELING (via Latent Dirichlet Allocation)

## Import packages

In [None]:
#the module 'sys' allows istalling module from inside Jupyter
import sys

!{sys.executable} -m pip install numpy
import numpy as np

!{sys.executable} -m pip install pandas
import pandas as pd

#Natrual Language ToolKit (NLTK)
!{sys.executable} -m pip install nltk
import nltk

!{sys.executable} -m pip install sklearn
from sklearn import metrics
#from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import  CountVectorizer #bag-of-words vectorizer 
from sklearn.decomposition import LatentDirichletAllocation #package for LDA

# Plotting tools

from pprint import pprint
!{sys.executable} -m pip install pyLDAvis #visualizing LDA
import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt
%matplotlib inline

#define text normalization function
%run ./Text_Normalization_Function.ipynb #defining text normalization function

#ignore warnings about future changes in functions as they take too much space
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

#calcualte coherence score for evaluating LDA model
#The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed
!{sys.executable} -m pip install gensim
import gensim
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
     

## Code 


### Calculate coherence score

In [None]:
def coherence_score(normalized_corpus,lda_model,metric = 'u_mass',vectorizer = None):
    """Compute a gensim coherence score for a fitted scikit-learn LDA model.

    Parameters
    ----------
    normalized_corpus : iterable
        The documents of the corpus; each one is passed to ``tokenize_text``
        (expected to be provided by the text-normalization notebook that is
        ``%run`` in the setup cell -- presumably each document is a
        normalized string; verify against that notebook).
    lda_model : object
        Fitted model exposing ``components_`` (one row of word weights per
        topic), e.g. ``sklearn.decomposition.LatentDirichletAllocation``.
    metric : str, default 'u_mass'
        Forwarded to ``CoherenceModel`` as its ``coherence`` argument.
    vectorizer : object, optional
        Fitted vectorizer whose feature names label the columns of
        ``lda_model.components_``. When omitted, the notebook-level variable
        ``bow_vectorizer_news`` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    The model's overall coherence rounded to 4 decimals.
    """
    if vectorizer is None:
        # Backward compatibility: earlier versions always read this notebook
        # global. A NameError here means it was never defined -- pass
        # `vectorizer=` explicitly instead.
        vectorizer = bow_vectorizer_news

    # Tokenize every document. Iterating directly (instead of indexing by
    # position) also works for containers that are not 0..n-1 indexed.
    news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus]

    # Dictionary of the corpus (token <-> integer id mapping)
    news_dictionary = Dictionary(news_corpus_tokenized)

    # Bag-of-words representation for each document of the corpus
    news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

    def get_topic_words(vectorizer, lda_model, n_words):
        """Return, for each topic, its `n_words` highest-weighted words."""
        # `get_feature_names` was removed from recent scikit-learn releases
        # in favour of `get_feature_names_out`; support both.
        if hasattr(vectorizer, "get_feature_names_out"):
            keywords = np.asarray(vectorizer.get_feature_names_out())
        else:
            keywords = np.array(vectorizer.get_feature_names())
        topic_words = []
        for topic_weights in lda_model.components_:
            # argsort on the negated weights yields descending order
            top_word_locs = (-topic_weights).argsort()[:n_words]
            topic_words.append(keywords.take(top_word_locs).tolist())
        return topic_words

    # top 20 words for each topic
    topic_topwords = get_topic_words(vectorizer = vectorizer, lda_model = lda_model, n_words=20)

    cm = CoherenceModel(topics = topic_topwords, #A list of top words for each topic
                        texts = news_corpus_tokenized, #Tokenized documents; needed by the sliding-window metrics (e.g. 'c_v')
                        corpus = news_corpus_bow, #Corpus with each document represented as Bag-of-Words (used by 'u_mass')
                        dictionary = news_dictionary, #Dictionary of the corpus
                        coherence = metric)
    #The default metric "u_mass" measures semantic similarity of the words in a topic, but there are other metrics as well.
    #*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

    #print("Coherence score for the model: ", np.round(cm.get_coherence(), 4))  # get coherence value
    #print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),4))

    return np.round(cm.get_coherence(), 4)

### Select topic number, build & evaluate the LDA model

In [None]:
def lda_score(normalized_corpus,vectorized_corpus,topic_num,max_iter=100,doc_topic_prior = 0.25,topic_word_prior = 0.25,metric = 'u_mass',random_state = None):
    """Fit one LDA model and score it on three metrics.

    Parameters
    ----------
    normalized_corpus : iterable
        Documents handed to ``coherence_score`` for the coherence metric.
    vectorized_corpus : array-like or sparse matrix
        Document-term matrix the LDA model is fitted and scored on.
    topic_num : int
        Number of topics (``n_components``) of the model.
    max_iter : int, default 100
        Forwarded to ``LatentDirichletAllocation``.
    doc_topic_prior, topic_word_prior : float, default 0.25
        Dirichlet priors forwarded to ``LatentDirichletAllocation``.
    metric : str, default 'u_mass'
        Coherence metric forwarded to ``coherence_score``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``LatentDirichletAllocation``. Pass an int to make the
        fit (and therefore the three scores) reproducible between runs;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(log_likelihood, perplexity, coherence)`` where the first two come
        from the model's ``score`` and ``perplexity`` methods evaluated on
        ``vectorized_corpus`` and the third from ``coherence_score``.
    """
    #build lda model
    lda_model = LatentDirichletAllocation(n_components=topic_num,
                                          max_iter=max_iter,
                                          doc_topic_prior = doc_topic_prior,
                                          topic_word_prior = topic_word_prior,
                                          random_state = random_state).fit(vectorized_corpus)

    # Both are evaluated on the same data the model was fitted on
    log_likelihood = lda_model.score(vectorized_corpus)
    perplexity = lda_model.perplexity(vectorized_corpus)
    coherence = coherence_score(normalized_corpus,lda_model,metric)

    return log_likelihood,perplexity,coherence
     

### Use a for loop to build several LDA models and compare their performance on three metrics

In [None]:
def lda_models(lower_value,upper_value,normalized_corpus,vectorized_corpus,score_fn = None):
    """Score one LDA model per topic count and tabulate the results.

    Parameters
    ----------
    lower_value, upper_value : int
        Topic counts to try, as in ``range(lower_value, upper_value)``
        (``upper_value`` itself is excluded).
    normalized_corpus : iterable
        Documents forwarded to the scoring function.
    vectorized_corpus : array-like or sparse matrix
        Document-term matrix forwarded to the scoring function.
    score_fn : callable, optional
        Called as ``score_fn(normalized_corpus, vectorized_corpus, topic_num)``
        and must return the three scores in the order
        (log-likelihood, perplexity, coherence). Defaults to ``lda_score``.
        Use e.g. ``functools.partial(lda_score, doc_topic_prior=0.1)`` to
        vary other hyperparameters.

    Returns
    -------
    pandas.DataFrame
        Rows ``log_likelihood_score``, ``perplexity_score`` and
        ``coherence_score``; one ``model<k>`` column per topic count holding
        the scores rounded to 4 decimals; and a ``best_model`` column naming
        the winning column for each row (highest value wins, except for
        perplexity where the lowest wins).

    Raises
    ------
    ValueError
        If the requested range of topic counts is empty.
    """
    if score_fn is None:
        score_fn = lda_score

    topic_counts = range(lower_value,upper_value)
    if not topic_counts:
        raise ValueError(
            "No topic counts to evaluate: range({0}, {1}) is empty".format(lower_value,upper_value))

    metric_rows = ['log_likelihood_score','perplexity_score','coherence_score']

    # Score the corpus that was passed in (not notebook globals), one model per topic count
    scores_by_model = {}
    for topic_num in topic_counts:
        scores = score_fn(normalized_corpus,vectorized_corpus,topic_num)
        scores_by_model["model{0}".format(topic_num)] = [round(value,4) for value in scores]

    chart = pd.DataFrame(scores_by_model,index = metric_rows)

    # Perplexity is the only metric here where a lower value is better
    lower_is_better = {'perplexity_score'}

    best_models = []
    for row_label in chart.index:
        row_values = chart.loc[row_label,:].values
        pick_best = np.argmin if row_label in lower_is_better else np.argmax
        best_models.append(chart.columns[pick_best(row_values)])

    # Added only after every row has been ranked, so it never takes part in the ranking
    chart['best_model'] = best_models

    return chart

### Check the word distribution in each topic (cluster)

If you want to pick a specific model and dive deeper into it, use the code below.

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic of `model`.

    For each row of ``model.components_`` a "Topic <index>:" header line is
    printed, followed by one line with the `no_top_words` words of
    `feature_names` that carry the largest weights in that row, ordered
    from highest to lowest weight and separated by single spaces.
    """
    for topic_no, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_no))
        # ascending argsort, reversed, then truncated -> descending top-N positions
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[pos] for pos in ranked_positions]
        print(" ".join(top_terms))

Or, if you trust the metrics, use the code below to look into the LDA model chosen as best by the majority of the three metrics.

In [None]:
def display_best_model_topic(chart,vectorized_corpus,vectorizer,num_top_words,
                             max_iter = 100,doc_topic_prior = 0.25,topic_word_prior = 0.25,random_state = None):
    """Refit the LDA model most often ranked best in `chart` and print its topics.

    Parameters
    ----------
    chart : pandas.DataFrame
        Table with a ``best_model`` column whose entries are labels ending
        in the topic count, e.g. ``"model4"`` (the format produced by
        ``lda_models``).
    vectorized_corpus : array-like or sparse matrix
        Document-term matrix the model is fitted on.
    vectorizer : object
        Fitted vectorizer that supplies the feature (word) names.
    num_top_words : int
        Number of words printed per topic.
    max_iter, doc_topic_prior, topic_word_prior : optional
        Forwarded to ``LatentDirichletAllocation``; the defaults are the
        values that used to be hard-coded here.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``LatentDirichletAllocation``; pass an int for a
        reproducible fit.

    Returns
    -------
    Whatever ``display_topics`` returns (it prints the topics).

    Notes
    -----
    A new model is fitted here; it is not the same fitted instance that was
    scored when `chart` was built.
    """
    best_lst = list(chart['best_model'])
    # Majority vote across the metrics; on a tie the first label in row order wins
    best_model = max(best_lst,key=best_lst.count)

    # Parse ALL trailing digits of the label: taking only the last character
    # would turn "model12" into 2 topics instead of 12.
    label_prefix = best_model.rstrip('0123456789')
    optimal_topic_num = int(best_model[len(label_prefix):])

    lda_model = LatentDirichletAllocation(n_components=optimal_topic_num,
                                          max_iter=max_iter,
                                          doc_topic_prior = doc_topic_prior,
                                          topic_word_prior = topic_word_prior,
                                          random_state = random_state).fit(vectorized_corpus)

    # `get_feature_names` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out`; support both.
    if hasattr(vectorizer,"get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return display_topics(lda_model,feature_names,num_top_words)

# Data Example

In [None]:
# from sklearn.datasets import fetch_20newsgroups
# categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# dataset = fetch_20newsgroups(shuffle=True, 
#                              random_state=1, 
#                              categories = categories, 
#                              remove=('headers', 'footers', 'quotes'))

# news_corpus = dataset.data

# #normalize data
# normalized_corpus_news = normalize_corpus(news_corpus)

# #define a Bag-of-Words vectorizer
# bow_vectorizer_news = CountVectorizer(max_features=1000)

# #vectorize data
# bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [117]:
# test
# lda_models(2,5,normalized_corpus_news,bow_news_corpus)

Unnamed: 0,model2,model3,model4,best_model
log_likelihood_score,-751912.6647,-743382.2788,-741007.5149,model4
perplexity_score,629.2006,584.8401,573.0568,model4
coherence_score,-1.5082,-1.4471,-1.4562,model3


In [None]:
# display_best_model_topic(chart,bow_news_corpus,bow_vectorizer_news,15)

Topic 0:
think know people like jesus good could time thing god even take well give way
Topic 1:
space nasa launch satellite system orbit year use mission earth shuttle data lunar program moon
Topic 2:
image file use edu program software graphic format jpeg ftp data available color mail system
Topic 3:
god people believe atheist religion think argument atheism exist many christian use point belief must


<br>**NOTE:** A model search like this can vary both the parameters of the Dirichlet distributions and the number of topics, or just the number of topics. In this script, I vary only the number of topics, so you can later add the Dirichlet parameters to the search according to your preference.

In [None]:
# #run this visualization on Colab notebook
# #prepare to display result in the Jupyter notebook
# pyLDAvis.enable_notebook()

# #run the visualization [mds is a function to use for visualizing the "distance" between topics]
# pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')