In [11]:
# Plot top words per topic

def plot_top_words(model, feature_names, n_top_words=7):
    """Draw one horizontal bar chart per topic showing its heaviest words.

    Args:
        model: Fitted topic model exposing ``components_``; each row holds the
            per-word weights of one topic.
        feature_names: Sequence indexable by column position of
            ``components_`` that yields the corresponding word.
        n_top_words: Number of highest-weighted words drawn for each topic.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Size the grid from the model rather than assuming exactly five topics:
    # a fixed 1x5 grid raises IndexError for larger models and leaves blank
    # panels for smaller ones. Six inches per panel keeps the original 30x10
    # canvas for a five-topic model.
    n_topics = len(model.components_)
    n_cols = max(n_topics, 1)
    # squeeze=False guarantees an array of axes even for a single panel.
    fig, axes = plt.subplots(
        1, n_cols, figsize=(6 * n_cols, 10), sharex=True, squeeze=False
    )
    axes = axes.flatten()

    # The figure title does not depend on the topic, so set it once.
    fig.suptitle("LDA", fontsize=40)

    for topic_idx, (ax, topic) in enumerate(zip(axes, model.components_)):
        # Reverse slice of the ascending argsort: indices of the n_top_words
        # largest weights, heaviest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx + 1}', fontdict={'fontsize': 30})
        # barh draws the first entry at the bottom; flip so the heaviest word
        # ends up on top.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

  
  
  
# Estimate model accuracy:
def est_accuracy(topic_vocab=None):
    """Print the fraction of documents whose most probable topic matches their label.

    Every text in the module-level ``df['text']`` is vectorised with
    ``lda_modeller.vectorizer`` and passed through ``lda.transform``. The
    index of the highest-scoring topic is translated to a label through
    ``topic_vocab`` and compared with ``df['labels']``.

    Args:
        topic_vocab: Lookup indexed by topic number (as produced by
            ``argmax``) that returns the label assigned to that topic.
            Required despite the ``None`` default.

    Raises:
        TypeError: If ``topic_vocab`` is not supplied.

    Side effects:
        Writes the predicted labels to ``df['preds']`` and prints the score.
    """
    # Fail fast with a readable message: without this guard the default None
    # only surfaces after both transforms have run, as an opaque
    # "'NoneType' object is not subscriptable" TypeError.
    if topic_vocab is None:
        raise TypeError(
            'topic_vocab is required: pass a lookup from topic index to label'
        )

    new_sentences = lda_modeller.vectorizer.transform(df['text'])
    topic_distributions = lda.transform(new_sentences)
    # One row per document; the column with the largest value is its topic.
    most_probable_topics = topic_distributions.argmax(axis=1)
    df['preds'] = [topic_vocab[num_topic] for num_topic in most_probable_topics]
    acc = sum(df['labels'] == df['preds']) / len(df)
    print(f'Supposed model accuracy: {acc}')
  
  
   
# Extract latent variables and sample a topic
def sample_topic(rand_seed=0, topic_vocab=None, num_topics=5, print_article=False):
    """Plot the topic distribution of one randomly chosen document.

    The module-level ``lda`` transforms ``lda_modeller.vectorized_sentences``,
    a row index is drawn from ``range(len(df))``, and that row is shown as a
    horizontal bar chart titled with the document's entry in ``df['labels']``.

    Args:
        rand_seed: Seed for the random draw; the same seed selects the same
            document.
        topic_vocab: Not used by this function; accepted for call
            compatibility.
        num_topics: Retained for backward compatibility. The topic axis is
            sized from the transformed row itself, so the value no longer has
            to match the model.
        print_article: When true, also print the wrapped text of the document.
    """
    Z = lda.transform(lda_modeller.vectorized_sentences)

    # A private RandomState yields the same draw as seeding the global NumPy
    # generator, without resetting random state for the rest of the session.
    rng = np.random.RandomState(rand_seed)
    i = rng.choice(len(df))
    z = Z[i]

    # Topics are numbered from 1 on the axis. Deriving the count from z
    # avoids the shape mismatch barh raised whenever the model did not have
    # exactly `num_topics` components.
    topics = np.arange(len(z)) + 1

    row = df.iloc[i]
    fig, ax = plt.subplots()
    ax.barh(topics, z)
    ax.set_yticks(topics)
    ax.set_title('True label: %s' % row['labels'])

    if print_article:
        print(wrap(row['text']))

  
# Text wrapper
def wrap(x):
    """Re-flow the string `x` with ``textwrap.fill`` at its default width.

    Existing whitespace characters are kept as they are (not collapsed to
    single spaces), and sentence endings are normalised to be followed by two
    spaces.
    """
    fill_options = {
        'replace_whitespace': False,
        'fix_sentence_endings': True,
    }
    return textwrap.fill(x, **fill_options)



# Calculate the coherence and perplexity scores
def coherence_perp_logl(sentences=None, vectorizer=None):
    """Print coherence, log perplexity and log likelihood of the module-level ``lda``.

    Args:
        sentences: Iterable of raw text documents to tokenise for the
            coherence model. Defaults to ``df['text']``.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` maps
            the columns of ``lda.components_`` to words. Defaults to
            ``lda_modeller.vectorizer``.

    Returns:
        Tuple ``(vocab, words, id2word)``: the feature names, the tokenised
        documents, and the ``corpora.Dictionary`` built from them.
    """
    # Identity checks are required here: `== None` on a pandas Series or a
    # NumPy array compares element-wise, and using that result in `if` raises
    # ValueError whenever the caller actually passes one in.
    if sentences is None:
        sentences = df['text']

    if vectorizer is None:
        vectorizer = lda_modeller.vectorizer

    vocab = vectorizer.get_feature_names_out()
    words = [nltk.word_tokenize(sentence) for sentence in sentences]
    id2word = corpora.Dictionary(words)

    # Ten highest-weighted words of every topic, heaviest first.
    topics = [
        [vocab[j] for j in component.argsort()[:-10 - 1:-1]]
        for component in lda.components_
    ]

    coherence_model_lda = CoherenceModel(
        topics=topics, texts=words, dictionary=id2word, coherence='c_v'
    )
    coherence_lda = coherence_model_lda.get_coherence()
    # Perplexity and likelihood are always scored on the matrix held by
    # lda_modeller, independent of the `sentences` argument.
    perplexity = np.log(lda.perplexity(lda_modeller.vectorized_sentences))
    log_likelihood = lda.score(lda_modeller.vectorized_sentences)
    print(f'The coherence score is: {round(coherence_lda, 3)}')
    print(f'The log perplexity score is: {round(perplexity, 3)}')
    print(f'The log likelihood is: {round(log_likelihood, 3)}')

    return vocab, words, id2word
  

  

  
def metrics_model_selection(dictionary, words, limit=10, start=2, step=1, topic_word_prior=0.6):
    """Fit one LDA model per candidate topic count and plot its quality metrics.

    For every ``num_topics`` in ``range(start, limit, step)`` a
    ``LatentDirichletAllocation`` model is fitted on
    ``lda_modeller.vectorized_sentences`` and scored with c_v coherence,
    log perplexity and log likelihood. The three curves are drawn side by side.

    Args:
        dictionary: Dictionary passed to ``CoherenceModel``.
        words: Tokenised documents passed to ``CoherenceModel`` as ``texts``.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between topic counts.
        topic_word_prior: Value forwarded to the model's ``topic_word_prior``.

    Returns:
        Tuple ``(model_list, coherence_values, perplexity_list,
        log_likelihood_scores)``, each with one entry per topic count tried.
    """
    # Resolve the vocabulary here instead of reading an undeclared
    # module-level `vocab`, which raised NameError unless another cell had
    # defined it. Presumably this is the vectorizer that produced
    # `vectorized_sentences`, so its feature names line up with the columns
    # of `components_` -- confirm against the lda_modeller definition.
    vocab = lda_modeller.vectorizer.get_feature_names_out()
    doc_term = lda_modeller.vectorized_sentences
    topic_counts = range(start, limit, step)

    coherence_values = []
    model_list = []
    perplexity_list = []
    log_likelihood_scores = []

    for num_topics in topic_counts:
        model = LatentDirichletAllocation(
            n_components=num_topics,
            max_iter=100,
            topic_word_prior=topic_word_prior,
        )
        model.fit(doc_term)
        model_list.append(model)

        # Ten highest-weighted words of every topic, heaviest first.
        topics = [
            [vocab[j] for j in component.argsort()[:-10 - 1:-1]]
            for component in model.components_
        ]

        coherencemodel = CoherenceModel(
            topics=topics, texts=words, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())

        # Stored on a log scale, although the panel below is labelled
        # 'Perplexity'.
        perplexity_list.append(np.log(model.perplexity(doc_term)))
        log_likelihood_scores.append(model.score(doc_term))

    _, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

    panels = (
        ('Coherence', 'Coherence Score', coherence_values),
        ('Perplexity', 'Perplexity Score', perplexity_list),
        ('Log likelihood', 'Log-likelihood Score', log_likelihood_scores),
    )
    for ax, (title, ylabel, values) in zip(axs, panels):
        ax.plot(topic_counts, values)
        ax.set_title(title)
        ax.set_xlabel('Num Topics')
        ax.set_ylabel(ylabel)

    return model_list, coherence_values, perplexity_list, log_likelihood_scores
  
  

# Print top words per topic  
def print_n_words_per_topic(num_words=15):
    """Print the `num_words` highest-weighted words for every row of ``H1``.

    Words come from ``lda_modeller.vectorizer.get_feature_names_out()``. Each
    topic is printed as one space-separated line, heaviest word first,
    followed by a printed newline character.
    """
    feature_names = np.array(lda_modeller.vectorizer.get_feature_names_out())

    def render(weights):
        # Reverse slice of the ascending argsort: largest weights first.
        order = np.argsort(weights)[:-num_words - 1:-1]
        return ' '.join([feature_names[k] for k in order])

    # Build every line before printing anything, as the original did.
    rendered = [render(row) for row in H1]
    for line in rendered:
        print(line)
        print('\n')
    
      
    
# Topic per document matrix
def topic_per_doc_matrix():
    """Build a table of ``W1`` rounded to two decimals, plus the dominant topic per row.

    Columns are named ``Topic0 .. Topic{n-1}`` from ``lda.n_components`` and
    rows ``Doc0 .. Doc{m-1}`` from the length of ``df['text']``. ``W1`` is
    presumably the document-topic weight matrix of the fitted model -- confirm
    where it is assigned.

    Returns:
        DataFrame of the rounded weights with an extra ``dominant_topic``
        column holding the zero-based index of the largest weight in each row.
    """
    colnames = ['Topic' + str(i) for i in range(lda.n_components)]
    docnames = ['Doc' + str(i) for i in range(len(df['text']))]
    df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)
    # Take the argmax of the unrounded weights: after rounding to two
    # decimals, distinct weights can tie (e.g. 0.334 and 0.336 both become
    # 0.33) and argmax would then pick the lower index, not the larger weight.
    df_doc_topic['dominant_topic'] = np.argmax(W1, axis=1)
    return df_doc_topic





NameError: name 'lda_modeller' is not defined