In [None]:
# Switch the working directory to the project folder on Google Drive so the
# relative path below resolves.
# NOTE(review): assumes Drive is already mounted at /content/drive -- confirm.
%cd /content/drive/MyDrive/Yoga_Classes

# Run the shared setup notebook containing the basic libraries and functions.
# NOTE(review): names used further down without a local import (pd, plt, sns)
# are presumably defined by this setup notebook -- verify.
%run 'notebooks/scripts/setup.ipynb'

In [None]:
# Import the libraries used for LDA modeling and model evaluation.
import gensim
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# Install a pinned pyLDAvis release before importing it.
# NOTE(review): the pin presumably exists because the code below relies on the
# `pyLDAvis.gensim` submodule, which later pyLDAvis releases renamed -- confirm
# before changing the version.
!pip install pyLDAvis==2.1.2
import pyLDAvis
import pyLDAvis.gensim
# Turn on pyLDAvis' notebook integration so prepared visualisations can be
# displayed as cell output.
pyLDAvis.enable_notebook()

In [None]:
def prepare_lda_inputs(text_series, no_below=15):
  """Turn raw documents into the three inputs the LDA helpers expect.

  Args:
    text_series: iterable of documents; each item must provide `.split()`
      (i.e. behave like a string), and is tokenized on whitespace.
    no_below: forwarded to `Dictionary.filter_extremes`; it is the only
      filter argument overridden here, the others keep the library defaults.

  Returns:
    A tuple `(texts, id2word, corpus)`: the tokenized documents, the filtered
    gensim dictionary, and one bag-of-words (from `doc2bow`) per document, in
    the same order as `text_series`.
  """
  # whitespace-tokenize every document
  tokenized_docs = []
  for document in text_series:
    tokenized_docs.append(document.split())

  # build the token <-> id vocabulary, then prune it
  vocabulary = corpora.Dictionary(tokenized_docs)
  vocabulary.filter_extremes(no_below=no_below)

  # encode each document as a bag of words using the pruned vocabulary
  bow_corpus = []
  for tokens in tokenized_docs:
    bow_corpus.append(vocabulary.doc2bow(tokens))

  return tokenized_docs, vocabulary, bow_corpus

In [None]:
def produce_lda_model(corpus, id2word, num_topics,
                      random_state=100, chunksize=200, passes=10):
  """Train a gensim LDA model on a bag-of-words corpus.

  The training hyper-parameters that used to be hard-coded are exposed as
  keyword arguments; their defaults are the previous fixed values, so existing
  three-argument calls behave exactly as before.

  Args:
    corpus: bag-of-words corpus, forwarded as `corpus`.
    id2word: token/id dictionary, forwarded as `id2word`.
    num_topics: number of topics to fit.
    random_state: seed forwarded to `LdaModel` (default 100).
    chunksize: forwarded to `LdaModel` as `chunksize` (default 200).
    passes: forwarded to `LdaModel` as `passes` (default 10).

  Returns:
    The `gensim.models.ldamodel.LdaModel` instance. `per_word_topics` is
    always enabled.
  """
  lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                              id2word=id2word,
                                              num_topics=num_topics,
                                              random_state=random_state,
                                              chunksize=chunksize,
                                              passes=passes,
                                              per_word_topics=True)

  return lda_model

In [None]:
def get_lda_coherence(lda_model, texts, id2word, coherence='c_v'):
  """Score an LDA model with gensim's `CoherenceModel`.

  Args:
    lda_model: the model to evaluate, forwarded as `model`.
    texts: tokenized documents, forwarded as `texts`.
    id2word: dictionary, forwarded as `dictionary`.
    coherence: name of the coherence measure (default 'c_v').

  Returns:
    A tuple `(coherence_model, score)`: the `CoherenceModel` instance and the
    value of its `get_coherence()`.
  """
  scorer = CoherenceModel(
      model=lda_model,
      texts=texts,
      dictionary=id2word,
      coherence=coherence,
  )
  return scorer, scorer.get_coherence()

# Background reading on topic coherence scores: https://www.baeldung.com/cs/topic-modeling-coherence-score

In [None]:
def compare_lda_coherence(dictionary, corpus, texts, start, limit, step, coherence='c_v'):
  """Train one LDA model per topic count and compare their coherence scores.

  For every `num_topics` in `range(start, limit, step)` a model is built with
  `produce_lda_model` and scored with `get_lda_coherence`. The best score is
  printed and the scores are plotted against the number of topics.

  Args:
    dictionary: token/id dictionary passed to both helpers.
    corpus: bag-of-words corpus used for training.
    texts: tokenized documents used for coherence scoring.
    start: first topic count (inclusive).
    limit: end of the topic-count range (exclusive).
    step: increment between topic counts.
    coherence: coherence measure forwarded to `get_lda_coherence`
      (default 'c_v', the value that was used implicitly before).

  Returns:
    A tuple `(model_list, coherence_model_list, coherence_values)` of three
    parallel lists, one entry per evaluated topic count.

  Raises:
    ValueError: if the range yields no topic counts (previously this surfaced
      as an opaque `max()` error after the loop).
  """
  # materialize the topic counts once; reused for the loop, lookup and plot
  topic_numbers = list(range(start, limit, step))
  if not topic_numbers:
    raise ValueError(
        f'No topic numbers to evaluate: range({start}, {limit}, {step}) is empty')

  # create lists to store results to compare
  coherence_values = []
  model_list = []
  coherence_model_list = []

  # iterate over defined number of topics and produce lda and coherence models
  for num_topics in topic_numbers:
    model = produce_lda_model(corpus, dictionary, num_topics)
    model_list.append(model)
    coherence_model, coherence_value = get_lda_coherence(
        model, texts, dictionary, coherence=coherence)
    coherence_model_list.append(coherence_model)
    coherence_values.append(coherence_value)

  # find best coherence value (first occurrence wins on ties)
  best_coherence_value = max(coherence_values)
  best_index = coherence_values.index(best_coherence_value)
  best_topic_num = topic_numbers[best_index]

  print(f'Best coherence value of {best_coherence_value} was reached for the LDA model with {best_topic_num} topics')
  print(f'Index of the corresponding model: {best_index}')

  # plot results to compare; labelled so the figure stands on its own
  plt.plot(topic_numbers, coherence_values)
  plt.xlabel('Number of topics')
  plt.ylabel(f'Coherence ({coherence})')
  plt.title('LDA coherence by number of topics')
  plt.show()

  return model_list, coherence_model_list, coherence_values


In [None]:
def get_lda_distance_map(lda_model, corpus, id2word):
  """Prepare the pyLDAvis visualisation data for an LDA model.

  Thin wrapper that forwards `lda_model`, `corpus` and `id2word`, in that
  order, to `pyLDAvis.gensim.prepare` and returns its result unchanged.
  """
  return pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

In [None]:
def plot_topics_coherence_lda(coherence_model_lda):
  """Draw a one-column heatmap of the per-topic coherence scores.

  Args:
    coherence_model_lda: object providing `get_coherence_per_topic()`, e.g.
      the coherence model returned by `get_lda_coherence`.

  Topics are labelled 1..N, where N is the number of scores returned. The
  labels used to be the fixed list 1..7, so any model with more than seven
  topics silently lost the extra topics when the labels were zipped with the
  scores.
  """
  coherence_per_topic = coherence_model_lda.get_coherence_per_topic()

  # derive 1-based topic labels from the actual number of scores
  topic_labels = range(1, len(coherence_per_topic) + 1)
  data_topic_score = pd.DataFrame(
      data=list(zip(topic_labels, coherence_per_topic)),
      columns=['Topic', 'Coherence'])
  data_topic_score = data_topic_score.set_index('Topic')

  fig, ax = plt.subplots( figsize=(2,6) )
  ax.set_title("Topics coherence\n $C_v$")
  sns.heatmap(data=data_topic_score, annot=True, square=True,
              cmap='Reds', fmt='.2f',
              linecolor='black', ax=ax )
  # keep the topic labels horizontal so they stay readable
  plt.yticks( rotation=0 )
  ax.set_xlabel('')
  ax.set_ylabel('')
  fig.show()