In [11]:
# Execute the pkgs/lda_modeller.ipynb notebook via the IPython %run magic.
# NOTE(review): presumably this is what defines the LDA modeller (sklearn LDA with a
# Count Vectorizer) that the helpers below receive as `lda_modeller` -- confirm against
# that notebook.
%run pkgs/lda_modeller.ipynb


# Plot top words per topic

def plot_top_words(model, feature_names, n_top_words = 7):
  """Draw one horizontal bar chart per topic with its highest-weighted words.

  Args:
    model: fitted topic model exposing ``components_``; each row holds the word
      weights of one topic.
    feature_names: sequence mapping a column index of ``components_`` to its word.
    n_top_words: number of highest-weighted words to show for each topic.

  The figure is shown with ``plt.show()``; nothing is returned.
  """
  n_topics = len(model.components_)
  # One column per topic instead of a fixed 5, so models with more topics do not
  # index past the end of `axes`. At least one column is requested because
  # plt.subplots rejects an empty grid.
  n_cols = max(n_topics, 1)
  # squeeze=False keeps `axes` a 2-D array even when there is a single column.
  fig, axes = plt.subplots(1, n_cols, figsize = (6 * n_cols, 10), sharex = True, squeeze = False)
  axes = axes.flatten()

  for topic_idx, topic in enumerate(model.components_):
    # Indices of the n_top_words largest weights, in descending order of weight
    top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
    top_features = [feature_names[i] for i in top_features_ind]
    weights = topic[top_features_ind]

    ax = axes[topic_idx]
    ax.barh(top_features, weights, height = 0.7)
    ax.set_title(f'Topic {topic_idx + 1}', fontdict = {'fontsize': 30})
    # Put the heaviest word at the top of each panel
    ax.invert_yaxis()
    ax.tick_params(axis = 'both', which = 'major', labelsize = 20)
    for side in ('top', 'right', 'left'):
      ax.spines[side].set_visible(False)

  # The title belongs to the whole figure, so set it once rather than per topic
  fig.suptitle("LDA", fontsize = 40)

  plt.subplots_adjust(top = 0.90, bottom = 0.05, wspace = 0.90, hspace = 0.3)
  plt.show()

  
  
  
# Estimate model accuracy:
def est_accuracy(lda_modeller, topic_vocab):
  """Label every document in the global ``df`` with its most probable topic and score it.

  Predictions are stored in ``df['preds ' + lda_modeller.lib]`` and compared with
  ``df['labels']``.

  Args:
    lda_modeller: object exposing ``lib`` ('skl' or 'gensim') and ``lda_model``;
      the 'skl' branch additionally uses ``vectorizer``.
    topic_vocab: mapping (or sequence) from topic index to the label it stands for.

  Returns:
    The fraction of rows whose predicted label equals ``df['labels']``.

  Raises:
    ValueError: if ``lda_modeller.lib`` is neither 'skl' nor 'gensim'.
  """
  preds_col_name = 'preds ' + lda_modeller.lib

  if lda_modeller.lib == 'skl':
    doc_term = lda_modeller.vectorizer.transform(df['text'])
    topic_distributions = lda_modeller.lda_model.transform(doc_term)
    topic_ids = topic_distributions.argmax(axis=1)

  elif lda_modeller.lib == 'gensim':
    tokenized_docs = [nltk.word_tokenize(sentence) for sentence in df['text']]

    # Token ids must come from the dictionary the model was trained with; a
    # dictionary rebuilt from these texts assigns unrelated ids to the same words.
    dictionary = getattr(lda_modeller.lda_model, 'id2word', None)
    if not hasattr(dictionary, 'doc2bow'):
      # NOTE(review): the model carries no usable dictionary, so fall back to the
      # previous behaviour. Ids produced here are not guaranteed to line up with
      # the model's vocabulary.
      dictionary = corpora.Dictionary(tokenized_docs)

    topic_ids = []
    for doc in tokenized_docs:
      # minimum_probability=0.0 asks for every topic, so a document never ends up
      # with an empty topic list (which previously mapped to topic_vocab[-1]).
      doc_topics = lda_modeller.lda_model.get_document_topics(
        dictionary.doc2bow(doc), minimum_probability=0.0)
      # max() keeps the first of equally probable topics, like the original scan
      best_topic, _ = max(doc_topics, key=lambda pair: pair[1])
      topic_ids.append(best_topic)

  else:
    raise ValueError(
      f'Unsupported LDA library {lda_modeller.lib!r}; expected "skl" or "gensim"')

  df[preds_col_name] = [topic_vocab[num_topic] for num_topic in topic_ids]
  acc = (df['labels'] == df[preds_col_name]).mean()
  print(f'Supposed model accuracy: {acc}')
  return acc
  
  
   
# Extract latent variables and sample a topic
def sample_topic(lda_modeller, rand_seed=0, topic_vocab=None, num_topics=5, print_article=False):
  """Plot the topic weights of one randomly chosen document from the global ``df``.

  Args:
    lda_modeller: object exposing ``lda_model.transform`` and ``vectorized_sentences``.
    rand_seed: seed passed to ``np.random.seed`` before the document is drawn.
    topic_vocab: accepted but not used by this function.
    num_topics: number of bars; topics are numbered 1..num_topics on the y axis.
    print_article: when true, also print the wrapped text of the chosen document.
  """
  doc_topic_weights = lda_modeller.lda_model.transform(lda_modeller.vectorized_sentences)

  # Seed the global NumPy generator so the same document is picked on every call
  np.random.seed(rand_seed)
  doc_idx = np.random.choice(len(df))

  topic_numbers = np.arange(num_topics) + 1
  _, ax = plt.subplots()
  ax.barh(topic_numbers, doc_topic_weights[doc_idx])
  ax.set_yticks(topic_numbers)

  chosen_row = df.iloc[doc_idx]
  ax.set_title('True label: %s' % chosen_row['labels'])

  if not print_article:
    return
  print(wrap(chosen_row['text']))

  
# Text wrapper
def wrap(x):
  """Return text ``x`` re-flowed by ``textwrap`` at its default line width.

  Whitespace characters such as newlines are not converted to spaces, and
  sentence endings are fixed up by the wrapper.
  """
  wrapper = textwrap.TextWrapper(replace_whitespace = False, fix_sentence_endings = True)
  return wrapper.fill(x)



# Calculate the coherence and perplexity scores
def coherence_perp_logl(lda_modeller, sentences=None):
  """Print the c_v coherence, log perplexity and log likelihood of the fitted model.

  Args:
    lda_modeller: object exposing ``vectorizer``, ``lda_model`` and
      ``vectorized_sentences``.
    sentences: iterable of raw text documents used as coherence reference texts;
      defaults to the global ``df['text']``.

  Returns:
    Tuple ``(vocab, words, id2word)``: the vectorizer's feature names, the
    tokenized documents, and the gensim dictionary built from them.
  """
  # `is None` rather than `== None`: comparing a pandas Series or NumPy array with
  # `==` is elementwise, and using that result in `if` raises an error.
  if sentences is None:
    sentences = df['text']

  vocab = lda_modeller.vectorizer.get_feature_names_out()
  words = [nltk.word_tokenize(sentence) for sentence in sentences]
  id2word = corpora.Dictionary(words)

  # The ten highest-weighted words of every topic, heaviest first
  n_top_words = 10
  topics = [
    [vocab[j] for j in topic_weights.argsort()[:-n_top_words - 1:-1]]
    for topic_weights in lda_modeller.lda_model.components_[:lda_modeller.lda_model.n_components]
  ]

  coherence_model_lda = CoherenceModel(topics=topics, texts=words, dictionary=id2word, coherence='c_v')
  coherence_lda = coherence_model_lda.get_coherence()
  # Perplexity and likelihood are measured on the modeller's own vectorized
  # corpus, not on `sentences`.
  perplexity = np.log(lda_modeller.lda_model.perplexity(lda_modeller.vectorized_sentences))
  log_likelihood = lda_modeller.lda_model.score(lda_modeller.vectorized_sentences)
  print(f'The coherence score is: {round(coherence_lda, 3)}')
  print(f'The log perplexity score is: {round(perplexity, 3)}')
  print(f'The log likelihood is: {round(log_likelihood, 3)}')

  return vocab, words, id2word
  
  

# Print top words per topic  
def print_n_words_per_topic(lda_modeller, num_words=15):
  """Print the ``num_words`` highest-weighted words of every topic, one topic per line.

  Args:
    lda_modeller: object exposing ``vectorizer.get_feature_names_out()`` and
      ``lda_model.components_`` (one row of word weights per topic).
    num_words: how many words to print for each topic.
  """
  vocab = np.array(lda_modeller.vectorizer.get_feature_names_out())
  # Read the topic-word weights from the modeller that was passed in, not from a
  # notebook-level global, so the output always matches `lda_modeller`.
  topic_word_weights = lda_modeller.lda_model.components_

  for weights in topic_word_weights:
    # argsort is ascending; the reversed tail slice yields the heaviest words first
    top_indices = np.argsort(weights)[:-num_words - 1:-1]
    print(' '.join(vocab[i] for i in top_indices))
    print('\n')
    
      
    
# Topic per document matrix
def topic_per_doc_matrix(lda_modeller):
  """Build a document-by-topic table with each document's dominant topic.

  Args:
    lda_modeller: object exposing ``lda_model.transform``, ``lda_model.n_components``
      and ``vectorized_sentences``.

  Returns:
    DataFrame indexed 'Doc0', 'Doc1', ... with columns 'Topic0', 'Topic1', ...
    holding the topic weights rounded to two decimals, plus a 'dominant_topic'
    column with the index of the highest-weighted topic.
  """
  # Compute the document-topic weights from the modeller that was passed in,
  # rather than relying on notebook-level globals.
  doc_topic = np.asarray(lda_modeller.lda_model.transform(lda_modeller.vectorized_sentences))

  colnames = ['Topic' + str(i) for i in range(lda_modeller.lda_model.n_components)]
  docnames = ['Doc' + str(i) for i in range(doc_topic.shape[0])]
  df_doc_topic = pd.DataFrame(np.round(doc_topic, 2), columns = colnames, index = docnames)
  # Take the argmax on the unrounded weights so rounding cannot create ties
  df_doc_topic['dominant_topic'] = np.argmax(doc_topic, axis = 1)
  return df_doc_topic



# Create document-term matrix
def create_document_term_matrix(dataframe, column_name='text', vectorizer='count', stops=stops, min_df=2, max_df=0.95):
  """Vectorize one text column and return the result as a dense DataFrame.

  Args:
    dataframe: DataFrame holding the documents.
    column_name: name of the column with the raw text.
    vectorizer: 'count' for CountVectorizer or 'tfidf' for TfidfVectorizer.
    stops: iterable of stop words handed to the vectorizer as a list.
    min_df: ``min_df`` setting forwarded to the vectorizer.
    max_df: ``max_df`` setting forwarded to the vectorizer.

  Returns:
    DataFrame with one column per feature name, sharing ``dataframe``'s index.

  Raises:
    ValueError: if ``vectorizer`` is neither 'count' nor 'tfidf'.
  """
  if vectorizer not in ('count', 'tfidf'):
    raise ValueError('The vectorizer value can be either "count" or "tfidf"')

  # Both vectorizers take the same settings; only the class differs
  vectorizer_cls = CountVectorizer if vectorizer == 'count' else TfidfVectorizer
  fitted = vectorizer_cls(stop_words = list(stops), min_df = min_df, max_df = max_df)

  term_matrix = fitted.fit_transform(dataframe[column_name])
  df_dtm = pd.DataFrame(term_matrix.toarray(), columns = fitted.get_feature_names_out())
  df_dtm.index = dataframe.index
  return df_dtm

NameError: name 'lda_modeller' is not defined