## Latent Dirichlet Allocation

In [1]:
import pandas as pd

In [2]:
# Read the NPR article corpus into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — confirm it
# exists on the machine running the notebook.
npr_csv_path = "D:/NLP/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv"
npr = pd.read_csv(npr_csv_path)
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
# (rows, columns) of the frame as loaded, before any clean-up.
npr.shape

(11992, 1)

In [5]:
# Remove exact duplicate rows. The result is rebound rather than mutated
# with inplace=True, so the step reads as an explicit transformation and
# can be chained with further clean-up if needed.
npr = npr.drop_duplicates()

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Bag-of-words vectorizer: drop English stop words, terms that occur in
# more than 90% of the documents, and terms that occur in fewer than 2.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)

In [8]:
# Learn the vocabulary from the article texts and build the
# document-term count matrix in one pass.
article_texts = npr['Article']
dtm = cv.fit_transform(article_texts)

In [9]:
# Show the sparse-matrix summary: documents x vocabulary terms.
dtm

<11991x54776 sparse matrix of type '<class 'numpy.int64'>'
	with 3033206 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
# Seven-topic LDA model; the fixed random_state keeps the fit repeatable.
LDA = LatentDirichletAllocation(
    random_state=42,
    n_components=7,
)

In [12]:
# Fit the topic model on the document-term matrix; the cell output is the
# fitted estimator's repr with all of its hyper-parameters.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

### Grab the vocabulary of words

In [13]:
# Vocabulary size; this matches the number of columns in dtm.
# NOTE(review): newer scikit-learn releases replace get_feature_names()
# with get_feature_names_out() — confirm the installed version before upgrading.
len(cv.get_feature_names())

54776

In [14]:
# The vocabulary comes back as a plain Python list, so it can be indexed by column id.
type(cv.get_feature_names())

list

In [18]:
import random

# Look up one random vocabulary word to sanity-check the vectorizer.
# random.randint() includes BOTH end points, so randint(0, 54776) could
# return 54776 and index one past the end of the 54776-entry list.
# randrange(len(...)) yields a valid index and also tracks the real
# vocabulary size instead of a hard-coded number.
feature_names = cv.get_feature_names()
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'glycemic'

### Grab the topic

In [19]:
# One row in components_ per topic, so this equals n_components (7).
len(LDA.components_)

7

In [20]:
# components_ is a NumPy array, so NumPy methods such as argsort() apply.
type(LDA.components_)

numpy.ndarray

In [21]:
# Row 0 of components_: the per-word weights of the first topic.
single_topic = LDA.components_[0]

In [22]:
single_topic.argsort() ## word indices ordered by ascending weight: least relevant first, most relevant last

array([ 2475, 18302, 44966, ..., 36282, 42992, 42560], dtype=int64)

In [23]:
single_topic.argsort()[-10:] ## indices of the ten highest-weighted words for this topic (highest last)

array([18349, 33390,  8149, 46580, 36309, 50425, 22673, 36282, 42992,
       42560], dtype=int64)

In [27]:
# argsort() orders indices by ascending weight, so the final 20 entries
# are the 20 highest-weighted words of the topic.
ranked_word_ids = single_topic.argsort()
top_twenty_words = ranked_word_ids[-20:]

In [28]:
# Print the topic's top words, from lowest to highest weight.
# The vocabulary list is fetched once: get_feature_names() builds the full
# list on every call, so calling it inside the loop repeated that work
# for each printed word.
feature_names = cv.get_feature_names()
for index in top_twenty_words:
    print(feature_names[index])

years
insurance
tax
million
government
money
states
law
year
president
federal
new
care
state
percent
trump
health
people
says
said


### Grab the highest-probability words per topic

In [40]:
# For every topic, list its 15 highest-weighted words (ascending weight,
# so the strongest word is last).
# The vocabulary is looked up once up front; the original called
# cv.get_feature_names() for each word of each topic, rebuilding the full
# vocabulary list every time.
feature_names = cv.get_feature_names()
for i, topic in enumerate(LDA.components_):
    print(f"The top 15 words for topic # {i}")
    print([feature_names[index] for index in topic.argsort()[-15:]])
    # Two blank-line prints keep the original spacing between topics.
    print('\n')
    print('\n')

The top 15 words for topic # 0
['money', 'states', 'law', 'year', 'president', 'federal', 'new', 'care', 'state', 'percent', 'trump', 'health', 'people', 'says', 'said']




The top 15 words for topic # 1
['ve', 'don', 'years', 'music', 'life', 'new', 'way', 'really', 'know', 'think', 'time', 'says', 'people', 'just', 'like']




The top 15 words for topic # 2
['military', 'time', 'new', 'china', 'north', 'country', 'government', 'just', 'war', 'water', 'years', 'city', 'said', 'people', 'says']




The top 15 words for topic # 3
['way', 'year', 'women', 'people', 'years', 'time', 'make', 'team', 'world', 'company', 'just', 'new', 'like', 'food', 'says']




The top 15 words for topic # 4
['democrats', 'presidential', 'election', 'democratic', 'new', 'vote', 'state', 'sanders', 'republican', 'said', 'party', 'voters', 'campaign', 'clinton', 'trump']




The top 15 words for topic # 5
['students', 'research', 'percent', 'time', 'new', 'university', 'don', 'children', 'school', 'just', '

In [41]:
# Score every article against every topic using the fitted model.
topic_results = LDA.transform(dtm)

In [42]:
topic_results.shape ## one row per document, one column per topic; each entry is the probability of that document belonging to that topic

(11991, 7)

In [43]:
topic_results[0].argmax() ## index of the highest-probability topic for the first document

6

In [44]:
# Label each article with the index of its most probable topic
# (argmax across the topic columns of every row).
dominant_topic_per_doc = topic_results.argmax(axis=1)
npr['Topic'] = dominant_topic_per_doc

In [45]:
# Display the articles together with their assigned topic index.
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",6
1,Donald Trump has used Twitter — his prefe...,6
2,Donald Trump is unabashedly praising Russian...,6
3,"Updated at 2:50 p. m. ET, Russian President Vl...",6
4,"From photography, illustration and video, to d...",0
5,I did not want to join yoga class. I hated tho...,5
6,With a who has publicly supported the debunk...,5
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",1
