In [23]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn.decomposition import LatentDirichletAllocation

In [24]:
# Load the NPR article corpus from a CSV file located in the working directory.
npr = pd.read_csv('npr.csv')
# Preview the first rows to confirm the file was parsed as expected.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [25]:
## Build the document-term matrix: one row per article, one column per kept term.
cv = CountVectorizer(
    max_df=0.95,           ## max_df (Maximum Document Frequency): skip terms in more than 95% of articles
    min_df=2,              ## min_df (Minimum Document Frequency): skip terms in fewer than 2 articles
    stop_words='english',  ## skip common English stop words
)
dtm = cv.fit_transform(npr['Article'])

In [26]:
## Fit a 7-topic LDA model on the document-term matrix.
## The fixed random_state keeps the learned topics reproducible between runs.
LDA = LatentDirichletAllocation(
    n_components=7,
    random_state=42,
)
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

In [27]:
## Vocabulary size: the number of distinct terms kept by the vectorizer,
## i.e. after stop-word removal and the max_df / min_df filtering (not every
## unique word that occurs in the articles).
## The fitted `vocabulary_` mapping is measured directly instead of calling
## `get_feature_names()`, which newer scikit-learn releases no longer provide
## and which would build the full list of names only to count it.
len(cv.vocabulary_)

54777

In [28]:
## Print ten vocabulary terms picked uniformly at random (with replacement).
## The feature names are fetched once, outside the loop, and the index range is
## derived from their actual count instead of a hard-coded upper bound, so the
## cell keeps working when the corpus or the vectorizer settings change.
## `get_feature_names_out()` replaces `get_feature_names()`, which newer
## scikit-learn releases no longer provide.
feature_names = cv.get_feature_names_out()

print("-- Random 10 Words --")
for _ in range(10):
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

-- Random 10 Words --
policyholders
preparedness
squandering
extortion
regionally
columbus
lap
penzance
barbed
470


In [29]:
## Inspect the fitted topic-word matrix (rows are topics, columns are vocabulary terms).
## Each value is followed by a space and a blank line.
print(len(LDA.components_), end=' \n\n')     ## number of components (topics)
print(LDA.components_, end=' \n\n')          ## the full matrix of components
print(len(LDA.components_[0]), end=' \n\n')  ## length of the first component
print(LDA.components_[0], end=' \n\n')       ## the first component itself

7 

[[8.64332806e+00 2.38014333e+03 1.42900522e-01 ... 1.43006821e-01
  1.42902042e-01 1.42861626e-01]
 [2.76191749e+01 5.36394437e+02 1.42857148e-01 ... 1.42861973e-01
  1.42857147e-01 1.42906875e-01]
 [7.22783888e+00 8.24033986e+02 1.42857148e-01 ... 6.14236247e+00
  2.14061364e+00 1.42923753e-01]
 ...
 [3.11488651e+00 3.50409655e+02 1.42857147e-01 ... 1.42859912e-01
  1.42857146e-01 1.42866614e-01]
 [4.61486388e+01 5.14408600e+01 3.14281373e+00 ... 1.43107628e-01
  1.43902481e-01 2.14271779e+00]
 [4.93991422e-01 4.18841042e+02 1.42857151e-01 ... 1.42857146e-01
  1.43760101e-01 1.42866201e-01]] 

54777 

[8.64332806e+00 2.38014333e+03 1.42900522e-01 ... 1.43006821e-01
 1.42902042e-01 1.42861626e-01] 



In [30]:
# Take the weight vector of the first topic (one weight per vocabulary term).
single_topic = LDA.components_[0]
# argsort() returns the indices that would sort the weights in ascending order,
# so the last entries are the indices of the most heavily weighted terms.
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [20]:
## Print the highest-weighted terms of every topic, least to most heavily weighted.
## The feature names are fetched once up front rather than once per printed word,
## using `get_feature_names_out()` because `get_feature_names()` is no longer
## provided by newer scikit-learn releases.
n_top_words = 15
feature_names = cv.get_feature_names_out()

for index, topic in enumerate(LDA.components_):
    ## argsort() is ascending, so the last n_top_words indices are the top terms.
    top_word_ids = topic.argsort()[-n_top_words:]
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
    print(feature_names[top_word_ids].tolist(), '\n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says'] 

THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said'] 

THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says'] 

THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says'] 

THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump'] 

THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

In [31]:
## Per-article topic distribution: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

## Explicit shape check, replacing bare mid-cell expressions whose values were
## never displayed (only the last expression of a cell is shown).
assert topic_results.shape == (dtm.shape[0], len(LDA.components_))

## Label each article with the index of its highest-scoring topic.
npr['Topic'] = topic_results.argmax(axis=1)
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2
