# Setup:

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the NPR articles; the text to model lives in the 'Article' column.
npr = pd.read_csv('npr.csv')

# Bag-of-words vectorizer settings:
#   max_df=0.9           -> ignore terms that appear in more than 90% of the documents
#   min_df=2             -> keep a term only if it appears in at least 2 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop words
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

# Learn the vocabulary from the articles and build the document-term matrix.
dtm = cv.fit_transform(npr['Article'])

# Step 1: Grab the vocab words & Step 2: Grab the topics

In [2]:
from sklearn.decomposition import LatentDirichletAllocation

# n_components is the number of topics to look for; random_state fixes the
# seed so repeated runs produce the same topics.
LDA = LatentDirichletAllocation(
    n_components=7,
    random_state=42,
)

# Fitting can take a while on the full corpus.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

#### Words:

In [3]:
# The vectorizer's vocabulary: one entry per distinct term that survived the
# max_df / min_df / stop-word filtering (not one entry per occurrence).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Peek at the first ten terms (list() keeps the display a plain Python list).
list(feature_names[:10])

['00', '000', '00000', '000s', '000th', '002', '004', '007', '009', '00s']

# Step 2: Grab the topics

In [4]:
# components_ is the fitted topic-term weight matrix: each row is a topic and
# each column lines up with one term of the vectorizer's vocabulary.
topic_word_matrix = LDA.components_
display(topic_word_matrix)

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

#### Combine Step 1 and 2:
> See each topic's top words (not necessary for computation, but helpful for deciding what subject each topic represents)

In [5]:
# Resolve the vocabulary once, outside the loop: the lookup rebuilds the whole
# term list on every call, so calling it per word is needlessly expensive.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    vocab = cv.get_feature_names_out()
else:
    vocab = cv.get_feature_names()

for i, topic in enumerate(LDA.components_):
    # argsort() is ascending, so the last 15 indices are the heaviest terms;
    # reverse them so the strongest word is listed first.
    top_indices = topic.argsort()[-15:][::-1]
    print(f"THE TOP 15 WORDS FOR TOPIC #{i}")
    print([vocab[index] for index in top_indices])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['says', 'said', 'health', 'people', 'care', 'million', 'company', 'government', 'percent', 'new', '000', 'federal', 'year', 'money', 'companies']


THE TOP 15 WORDS FOR TOPIC #1
['said', 'trump', 'president', 'police', 'told', 'people', 'news', 'says', 'reports', 'npr', 'government', 'russia', 'security', 'house', 'military']


THE TOP 15 WORDS FOR TOPIC #2
['says', 'like', 'people', 'just', 'food', 'years', 'new', 'city', 'water', 'time', 'day', 'home', 'family', 'world', 'way']


THE TOP 15 WORDS FOR TOPIC #3
['says', 'people', 'health', 'women', 'like', 'study', 'children', 'just', 'patients', 'disease', 'medical', 'years', 'don', 'new', 'time']


THE TOP 15 WORDS FOR TOPIC #4
['trump', 'said', 'clinton', 'president', 'state', 'people', 'campaign', 'republican', 'court', 'obama', 'new', 'party', 'election', 'vote', 'voters']


THE TOP 15 WORDS FOR TOPIC #5
['like', 'just', 'people', 'think', 'know', 'time', 'really', 'music', 'way', 'new', 'don', 'life

# Step 3: Compute the topic probabilities for each document

In [6]:
# Run every article's term counts through the fitted model. The result is
# indexed by article along axis 0 and by topic along axis 1.
topic_results = LDA.transform(dtm)

# Finalize: Assign the classes/topics to the observations

In [7]:
# For every article, pick the topic index with the largest score as its label.
dominant_topic = topic_results.argmax(axis=1)
npr['Topic'] = dominant_topic

# Preview the labelled articles.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
