In [1]:
%%capture
# Execute the project setup script in this kernel's namespace; %%capture hides its output.
# NOTE(review): presumably this is where `df` (used below) gets defined — confirm in full_setup.py.
%run full_setup.py

In [2]:
from tfidf_corpus_dictionary import get_tfidf_tokendocs_corpus_dict

from gensim.models import LdaModel, LsiModel, CoherenceModel
from sklearn.decomposition import NMF, PCA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
import numpy as np
from scipy import sparse

With this function we get various objects needed for modelling:
1. TFIDF matrix as input data, with specified parameters
2. feature names as the words retained with TFIDF
3. tokenized documents, a list of lists, where the inner lists contain the tokens for each document
4. corpus, gensim object needed for modelling with that package
5. dictionary, gensim object containing information on the words of the corpus and their positions

In [3]:
# Build all modelling inputs in one call: TF-IDF matrix, retained feature names,
# tokenized documents, and the gensim corpus and dictionary.
(tfidf_matrix,
 feature_names,
 tokenized_docs,
 corpus,
 dictionary) = get_tfidf_tokendocs_corpus_dict(
    df,
    max_df=0.5,
    min_df=5,
    max_features=5000,
)

Now we'll evaluate different topic models based on Coherence score.

Coherence is a metric used to evaluate topic quality. The higher the coherence, the better the model did in creating the topics.

For every model we'll use a function that computes the coherence for several numbers of topics (5, 10, 15, 20, 50). This information will be used to evaluate how each model performs as the number of topics changes.

In [5]:
from coherence_topics import coherence_topics

In [5]:
# Collect, per model family, the coherence results returned by coherence_topics.
models = ['LDA', 'LSA', 'NMF', 'PCA']
evaluation = {}

for mod in models:
    # Every model receives the same inputs; only the model name changes.
    evaluation[mod] = coherence_topics(
        model_name=mod,
        corpus=corpus,
        dictionary=dictionary,
        texts=tokenized_docs,
        feature_names=feature_names,
        tfidf=tfidf_matrix,
    )

In [6]:
# Display the (number of topics, coherence) pairs collected for LDA.
evaluation['LDA']

[(5, 0.545979304624336),
 (10, 0.5050314716993893),
 (15, 0.5285162980465483),
 (20, 0.5134451552900542),
 (50, 0.4630950783156206)]

In [7]:
# Display the (number of topics, coherence) pairs collected for LSA.
evaluation['LSA']

[(5, 0.4455656615884063),
 (10, 0.49652711581080433),
 (15, 0.43548502778016585),
 (20, 0.3849896198652013),
 (50, 0.32808741975234135)]

In [8]:
# Display the (number of topics, coherence) pairs collected for NMF.
# NOTE(review): in the saved output the coherence is the same (up to float rounding) for
# every number of topics, which suggests the NMF path in coherence_topics may not be
# using the requested topic count — verify before trusting these scores.
evaluation['NMF']

[(5, 0.3597199653853072),
 (10, 0.35971996538530726),
 (15, 0.35971996538530715),
 (20, 0.35971996538530715),
 (50, 0.35971996538530715)]

In [None]:
# Display the coherence results collected for PCA.
# NOTE(review): this cell has no saved output/execution count — confirm the PCA evaluation completes.
evaluation['PCA']

Now we'll fit the LDA model with the number of topics that yields the highest coherence

In [None]:
# Choose the topic count from the coherence evaluation instead of hard-coding it.
# evaluation['LDA'] holds (number of topics, coherence) pairs; on ties max() keeps the
# first (i.e. smallest) topic count.
best_lda_num_topics = max(evaluation['LDA'], key=lambda pair: pair[1])[0]

# Fit the final LDA model; random_state is fixed so the fit is reproducible.
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=best_lda_num_topics,
                     alpha='symmetric', eta='auto', passes=5, random_state=1)

In [None]:
# Print the 15 top words of every LDA topic.
# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs, so the
# words are read directly instead of being parsed out of the 'weight*"word" + ...' string
# (which breaks for tokens containing '*', ' + ' or quotes). num_topics=-1 selects all topics.
for topic_index, word_weights in lda_model.show_topics(num_topics=-1, num_words=15, formatted=False):
    word_list = [word for word, _weight in word_weights]
    print(f"Topic {topic_index}: {', '.join(word_list)}")

In [None]:
import pyLDAvis, pyLDAvis.gensim

In [None]:
# Visualize the LDA model using pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word, mds='tsne')
pyLDAvis.display(vis)

Now we're going to do the same for LSA

In [None]:
# Choose the LSA topic count with the highest coherence from the evaluation rather than
# reusing the LDA value. evaluation['LSA'] holds (number of topics, coherence) pairs;
# on ties max() keeps the first (i.e. smallest) topic count.
best_lsa_num_topics = max(evaluation['LSA'], key=lambda pair: pair[1])[0]

lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=best_lsa_num_topics)

In [None]:
# Print the 10 top words of every LSA topic.
# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs, so the
# words are read directly instead of being parsed out of the 'weight*"word" + ...' string
# (which breaks for tokens containing '*', ' + ' or quotes). num_topics=-1 selects all topics.
for topic_index, word_weights in lsi_model.show_topics(num_topics=-1, num_words=10, formatted=False):
    word_list = [word for word, _weight in word_weights]
    print(f"Topic {topic_index}: {', '.join(word_list)}")