In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('./archive/abcnews-date-text.csv')
data = df.sample(100000)
documents = data['headline_text'].values

count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)

count_X = count_vectorizer.fit_transform(documents)
tfidf_X = tfidf_vectorizer.fit_transform(documents)

n_topics = 10

def print_top_words_with_counts(model, feature_names, n_top_words, X):
    topic_assignments = model.transform(X).argmax(axis=1)
    topic_counts = np.bincount(topic_assignments)

    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        message += f" | # of headlines: {topic_counts[topic_idx]}"
        print(message)

In [3]:
nmf_count = NMF(n_components=n_topics)
nmf_tfidf = NMF(n_components=n_topics)
nmf_count.fit(count_X)
nmf_tfidf.fit(tfidf_X)

In [4]:
n_top_words = 5

print("\nNMF with CountVectorizer:")
print_top_words_with_counts(nmf_count, count_vectorizer.get_feature_names_out(), n_top_words, count_X)

print("\nNMF with TfidfVectorizer:")
print_top_words_with_counts(nmf_tfidf, tfidf_vectorizer.get_feature_names_out(), n_top_words, tfidf_X)


NMF with CountVectorizer:
Topic #0: police investigate probe missing death | # of headlines: 4276
Topic #1: new zealand york cases laws | # of headlines: 3293
Topic #2: man charged murder jailed crash | # of headlines: 8024
Topic #3: says minister mp labor pm | # of headlines: 4685
Topic #4: govt urged health qld wa | # of headlines: 11986
Topic #5: court accused face murder told | # of headlines: 8089
Topic #6: australia day world south cup | # of headlines: 20055
Topic #7: council plan water coast rise | # of headlines: 14088
Topic #8: nsw rural coast sydney coronavirus | # of headlines: 22066
Topic #9: interview extended michael david nrl | # of headlines: 3438

NMF with TfidfVectorizer:
Topic #0: man charged murder jailed missing | # of headlines: 4210
Topic #1: police investigate probe missing search | # of headlines: 4228
Topic #2: govt says council health plan | # of headlines: 46853
Topic #3: interview extended michael david john | # of headlines: 2921
Topic #4: rural news nat

In [5]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

documents = [word_tokenize(doc.lower()) for doc in documents]



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# evaluate the model
# 1. coherence score
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

def get_topics(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        print(topic)
        topics.append(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    return topics

def get_coherence_score(documents, topics):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    cm = CoherenceModel(topics=topics, texts=documents, dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()

count_topics = get_topics(nmf_count, count_vectorizer.get_feature_names_out(), n_top_words)
tfidf_topics = get_topics(nmf_tfidf, tfidf_vectorizer.get_feature_names_out(), n_top_words)
print(f"\nCoherence score for NMF with CountVectorizer: {get_coherence_score(documents, count_topics)}")
print(f"Coherence score for NMF with TfidfVectorizer: {get_coherence_score(documents, tfidf_topics)}")



[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 2.67080873e-03
 0.00000000e+00 2.49909190e-05]
[0.00000000e+00 0.00000000e+00 3.50582456e-05 ... 0.00000000e+00
 4.56904235e-05 5.18754676e-05]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
 8.30529829e-07 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 1.50094502e-04 ... 5.46596493e-05
 9.72234203e-05 5.20005253e-05]
[8.28911921e-04 3.86808520e-04 8.78515827e-04 ... 8.40754919e-05
 6.67871090e-05 2.16072157e-04]
[3.69425497e-03 0.00000000e+00 0.00000000e+00 ... 4.83683763e-04
 9.86280524e-05 5.02979530e-05]
[0.00013018 0.         0.         ... 0.00071786 0.00042114 0.00018934]
[2.98894032e-04 0.00000000e+00 0.00000000e+00 ... 9.55299698e-05
 5.41949380e-05 6.11878955e-05]
[1.44904616e-04 5.33289069e-03 1.26287547e-03 ... 0.00000000e+00
 7.75223948e-05 3.28561808e-05]
[0.00000000e+00 0.00000000e+00 2.13940017e-05 ... 1.51620795e-05
 1.07358489e-04 1.66016983e-05]
[0. 0. 0. ... 0. 0. 0.]
[0.00000000e+00 0.00000000e+00 

In [7]:
# 2. perplexity for NMF
print(f"\nPerplexity for NMF with CountVectorizer: {nmf_count.reconstruction_err_}")
print(f"Perplexity for NMF with TfidfVectorizer: {nmf_tfidf.reconstruction_err_}")



Perplexity for NMF with CountVectorizer: 697.1292138075407
Perplexity for NMF with TfidfVectorizer: 313.07241925715147


In [9]:
# coverage
def get_topic_coverage(topic_matrix, document_term_matrix, threshold):
    '''
    returns the proportion of documents that are 
    assigned a topic with a proportion greater than 
    the threshold
    '''
    topic_proportions = np.max(topic_matrix, axis=1)
    return np.sum(topic_proportions > threshold) / len(topic_proportions)
print(count_X)

print(f"\nCoverage for NMF with CountVectorizer: {get_topic_coverage(nmf_count.transform(count_X), count_X, 0.1)}")
print(f"Coverage for NMF with TfidfVectorizer: {get_topic_coverage(nmf_tfidf.transform(tfidf_X), tfidf_X, 0.1)}")

  (0, 7283)	1
  (0, 7958)	1
  (0, 5179)	1
  (0, 19478)	1
  (0, 14462)	1
  (0, 17341)	1
  (0, 5355)	1
  (1, 11795)	1
  (1, 5332)	1
  (1, 13409)	1
  (1, 12277)	1
  (1, 11260)	1
  (1, 2908)	1
  (1, 9778)	1
  (2, 8983)	1
  (2, 4749)	1
  (2, 8367)	1
  (2, 19378)	1
  (2, 16399)	1
  (2, 19375)	1
  (3, 8674)	1
  (3, 1990)	1
  (3, 17188)	1
  (3, 6669)	1
  (3, 9199)	1
  :	:
  (99995, 10733)	1
  (99995, 7625)	1
  (99995, 3564)	1
  (99995, 7223)	1
  (99995, 6823)	1
  (99996, 12095)	1
  (99996, 15215)	1
  (99996, 20913)	1
  (99996, 10983)	1
  (99997, 17439)	1
  (99997, 9891)	1
  (99997, 14204)	1
  (99997, 11107)	1
  (99998, 3351)	1
  (99998, 14114)	1
  (99998, 17534)	1
  (99998, 15548)	1
  (99998, 10394)	1
  (99999, 8437)	1
  (99999, 16990)	1
  (99999, 18424)	1
  (99999, 9455)	1
  (99999, 17344)	1
  (99999, 2160)	1
  (99999, 2028)	1

Coverage for NMF with CountVectorizer: 0.17247
Coverage for NMF with TfidfVectorizer: 0.02252
