In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('./archive/abcnews-date-text.csv')
data = df.sample(100000)
documents = data['headline_text'].values

count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)

count_X = count_vectorizer.fit_transform(documents)
tfidf_X = tfidf_vectorizer.fit_transform(documents)

n_topics = 10

def print_top_words_with_counts(model, feature_names, n_top_words, X):
    topic_assignments = model.transform(X).argmax(axis=1)
    topic_counts = np.bincount(topic_assignments)

    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        message += f" | # of headlines: {topic_counts[topic_idx]}"
        print(message)

In [3]:
nmf_count = NMF(n_components=n_topics)
nmf_tfidf = NMF(n_components=n_topics)
nmf_count.fit(count_X)
nmf_tfidf.fit(tfidf_X)

In [4]:
n_top_words = 5

print("\nNMF with CountVectorizer:")
print_top_words_with_counts(nmf_count, count_vectorizer.get_feature_names_out(), n_top_words, count_X)

print("\nNMF with TfidfVectorizer:")
print_top_words_with_counts(nmf_tfidf, tfidf_vectorizer.get_feature_names_out(), n_top_words, tfidf_X)


NMF with CountVectorizer:
Topic #0: police probe investigate death crash | # of headlines: 4414
Topic #1: new zealand hospital york cases | # of headlines: 3446
Topic #2: man charged murder dies crash | # of headlines: 9773
Topic #3: says minister mp report government | # of headlines: 4941
Topic #4: govt urged wa qld health | # of headlines: 12211
Topic #5: court accused face told murder | # of headlines: 8781
Topic #6: australia day south world cup | # of headlines: 21980
Topic #7: council plan water rise coast | # of headlines: 16566
Topic #8: nsw rural coast country hour | # of headlines: 14396
Topic #9: interview extended michael nrl john | # of headlines: 3492

NMF with TfidfVectorizer:
Topic #0: man charged murder missing jailed | # of headlines: 4334
Topic #1: police probe investigate missing death | # of headlines: 5321
Topic #2: govt council says plan health | # of headlines: 43389
Topic #3: interview extended michael john nrl | # of headlines: 3026
Topic #4: abc news rural 

In [11]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

documents = [word_tokenize(doc.lower()) for doc in documents]



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# evaluate the model
# 1. coherence score
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

def get_topics(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        topics.append(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topics

def get_coherence_score(documents, topics):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    cm = CoherenceModel(topics=topics, texts=documents, dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()

count_topics = get_topics(nmf_count, count_vectorizer.get_feature_names_out(), n_top_words)
tfidf_topics = get_topics(nmf_tfidf, tfidf_vectorizer.get_feature_names_out(), n_top_words)

print(f"\nCoherence score for NMF with CountVectorizer: {get_coherence_score(documents, count_topics)}")
print(f"Coherence score for NMF with TfidfVectorizer: {get_coherence_score(documents, tfidf_topics)}")




Coherence score for NMF with CountVectorizer: 0.3261972013380249
Coherence score for NMF with TfidfVectorizer: 0.3088138487892074


In [14]:
# 2. perplexity for NMF
print(f"\nPerplexity for NMF with CountVectorizer: {nmf_count.reconstruction_err_}")
print(f"Perplexity for NMF with TfidfVectorizer: {nmf_tfidf.reconstruction_err_}")



Perplexity for NMF with CountVectorizer: 696.7461440010868
Perplexity for NMF with TfidfVectorizer: 313.03293176949614


In [19]:
# coverage
def get_topic_coverage(topic_matrix, document_term_matrix, threshold):
    '''
    returns the proportion of documents that are 
    assigned a topic with a proportion greater than 
    the threshold
    '''
    topic_proportions = np.max(topic_matrix, axis=1)
    return np.sum(topic_proportions > threshold) / len(topic_proportions)

print(f"\nCoverage for NMF with CountVectorizer: {get_topic_coverage(nmf_count.transform(count_X), count_X, 0.1)}")
print(f"Coverage for NMF with TfidfVectorizer: {get_topic_coverage(nmf_tfidf.transform(tfidf_X), tfidf_X, 0.1)}")


Coverage for NMF with CountVectorizer: 0.17353
Coverage for NMF with TfidfVectorizer: 0.0248
