In [27]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

# Latent Dirichlet Allocation

In [4]:
# Load the NPR articles from the CSV file into a DataFrame and preview the first rows.
npr = pd.read_csv('./data/npr.csv')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [7]:
# Turn the article text into a bag-of-words document-term matrix.
#   max_df=0.95 -> ignore terms whose document frequency is above 95% (they show up too often)
#   min_df=2    -> ignore terms that appear in fewer than 2 documents (they are too rare)
#   stop_words  -> drop scikit-learn's built-in English stop-word list
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
dtm = cv.fit_transform(npr['Article'])

# Fit a 10-topic LDA model on the counts; random_state is fixed so reruns give the same topics.
LDA = LatentDirichletAllocation(random_state=42, n_components=10)
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [14]:
# Size of the vocabulary learned by the CountVectorizer, plus one sample term.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; it is kept here because the recorded outputs come from an older one.
vocabulary = cv.get_feature_names()
print(len(vocabulary))
print(vocabulary[6000])

54777
blake


In [15]:
# One row per topic (10) and one column per vocabulary term.
# NOTE: the entries are unnormalized per-topic word weights (pseudo-counts), not probabilities;
# each row would have to be normalized to obtain a topic-word distribution.
LDA.components_.shape

(10, 54777)

In [16]:
# Inspect the raw topic-term weight matrix (NumPy abbreviates the display of large arrays).
LDA.components_

array([[5.11072577e+00, 1.94461867e+03, 1.00001806e-01, ...,
        1.00005562e-01, 1.00000000e-01, 1.00001005e-01],
       [7.90134677e+00, 9.65122359e+01, 1.00000000e-01, ...,
        1.00000000e-01, 1.00000000e-01, 1.00005679e-01],
       [5.31852874e+00, 3.00433521e+02, 1.00000000e-01, ...,
        6.09994114e+00, 2.09990395e+00, 2.09985534e+00],
       ...,
       [2.70759028e+01, 5.39924878e+02, 1.00000000e-01, ...,
        1.00009334e-01, 1.00000000e-01, 1.00000000e-01],
       [1.00044538e-01, 3.22049903e+02, 1.00000000e-01, ...,
        1.00002615e-01, 1.00002538e-01, 1.00002208e-01],
       [2.79006612e-01, 5.87022720e+02, 1.00000000e-01, ...,
        1.00005607e-01, 1.00006588e-01, 1.00000000e-01]])

In [17]:
# Print the five highest-weighted words of the first LDA topic.
# The vocabulary is fetched once up front: get_feature_names() rebuilds the full
# term list on every call, so calling it inside the loop repeats that work per word.
feature_names = cv.get_feature_names()
single_topic = LDA.components_[0]
# argsort() orders indices by ascending weight, so the last five are the top five
# (printed from the fifth-highest up to the highest).
top_word_indices = single_topic.argsort()[-5:]
for index in top_word_indices:
    print(feature_names[index])

million
health
company
said
says


In [19]:
# LDA only yields word weights per topic; it is up to the user to decide on a name for each topic.
# A single variable drives both the printed header and the slice so the two cannot disagree
# (the header previously announced 15 words while only 5 were shown).
n_top_words = 5
# Fetch the vocabulary once instead of rebuilding it for every printed word.
feature_names = cv.get_feature_names()
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the trailing slice holds the highest-weighted terms.
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['million', 'health', 'company', 'said', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['russia', 'house', 'president', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['food', 'new', 'just', 'like', 'says']


THE TOP 15 WORDS FOR TOPIC #3
['women', 'world', 'like', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #4
['law', 'state', 'said', 'court', 'says']


THE TOP 15 WORDS FOR TOPIC #5
['think', 'music', 'people', 'just', 'like']


THE TOP 15 WORDS FOR TOPIC #6
['new', 'like', 'students', 'school', 'says']


THE TOP 15 WORDS FOR TOPIC #7
['says', 'reports', 'people', 'police', 'said']


THE TOP 15 WORDS FOR TOPIC #8
['campaign', 'people', 'said', 'clinton', 'trump']


THE TOP 15 WORDS FOR TOPIC #9
['patients', 'study', 'people', 'health', 'says']




In [20]:
# Compute the document-topic matrix: one row per article, one column per topic.
topic_results = LDA.transform(dtm)
topic_results[0].shape # the first article's row holds one value per topic (10)

(10,)

In [23]:
# Label every article with the topic that has the largest value in its row.
dominant_topic = topic_results.argmax(axis=1)
npr['Topic'] = dominant_topic
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",6


# Non-negative Matrix Factorization

Use NMF (highest coefficients) instead of LDA (highest probabilities).

In [24]:
# Re-read the CSV so the NMF section starts from a fresh DataFrame,
# without the 'Topic' column that was assigned in the LDA section.
npr = pd.read_csv('./data/npr.csv')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [28]:
# NMF is fitted on TF-IDF weights; the document-frequency cut-offs and the
# stop-word list match the ones passed to the CountVectorizer in the LDA section.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
dtm = tfidf.fit_transform(npr['Article'])

# Factorize the TF-IDF matrix into 7 topics; random_state is fixed for reproducibility.
nmf_model = NMF(random_state=42, n_components=7)
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [30]:
# Report the vocabulary size, then the number of topics (rows of components_).
vocabulary_size = len(tfidf.get_feature_names())
topic_count = len(nmf_model.components_)
print(vocabulary_size)
print(topic_count)

54777
7


In [33]:
# List the five highest-coefficient words of the first NMF topic.
# The vocabulary is fetched once: get_feature_names() rebuilds the full term list
# on every call, so calling it inside the comprehension repeats that work per word.
feature_names = tfidf.get_feature_names()
single_topic = nmf_model.components_[0]
# argsort() is ascending, so the last five indices belong to the largest coefficients.
top_word_indices = single_topic.argsort()[-5:]
[feature_names[x] for x in top_word_indices]

['water', 'food', 'people', 'zika', 'says']

In [34]:
# Print the five highest-coefficient words of every NMF topic.
# Fetch the vocabulary once instead of rebuilding it for every printed word.
feature_names = tfidf.get_feature_names()
for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 5 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the trailing slice holds the highest-coefficient terms.
    print([feature_names[i] for i in topic.argsort()[-5:]])
    print('\n')

THE TOP 5 WORDS FOR TOPIC #0
['water', 'food', 'people', 'zika', 'says']


THE TOP 5 WORDS FOR TOPIC #1
['donald', 'campaign', 'said', 'president', 'trump']


THE TOP 5 WORDS FOR TOPIC #2
['coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 5 WORDS FOR TOPIC #3
['president', 'reports', 'court', 'said', 'police']


THE TOP 5 WORDS FOR TOPIC #4
['hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 5 WORDS FOR TOPIC #5
['people', 'think', 'just', 'music', 'like']


THE TOP 5 WORDS FOR TOPIC #6
['student', 'education', 'schools', 'school', 'students']




In [35]:
# Project the articles onto the NMF topics and label each one with the topic
# that has the largest coefficient in its row.
topic_results = nmf_model.transform(dtm)
strongest_topic = topic_results.argmax(axis=1)
npr['Topic'] = strongest_topic
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6
5,I did not want to join yoga class. I hated tho...,5
6,With a who has publicly supported the debunk...,0
7,"I was standing by the airport exit, debating w...",0
8,"If movies were trying to be more realistic, pe...",0
9,"Eighteen years ago, on New Year’s Eve, David F...",5
