In [18]:
# Non-Negative Matrix Factorization

# Non-Negative Matrix Factorization (NMF) is an unsupervised learning algorithm that simultaneously
# performs dimensionality reduction and clustering.

# We can use this technique in conjunction with TF-IDF to model topics across documents.

# Given a non-negative matrix A, find a rank-k approximation A ≈ WH in terms of the
# non-negative factors W and H.


# Basis Vectors ==> W 
# Coefficient Matrix ==> H

# A (n x m data matrix: rows = features, cols = objects) ≈ W (n x k basis vectors: rows = features) * H (k x m coefficient matrix: cols = objects)
# Note: W >= 0 and H >= 0

# Approximate each object (column of A) by a linear combination of k reduced dimensions or "basis vectors" in W.

# Each basis vector can be interpreted as a cluster. The memberships of objects in these clusters are encoded by H.


# Input: Non-negative data matrix (A), number of basis vectors (k), and initial values for
# the factors W and H (e.g. random matrices). In topic modelling, 'k' is the number of
# topics we choose, and 'A' holds the TF-IDF weights of the words across all the documents.

# Objective function: Some measure of reconstruction error between A and the approximation WH.

# Optimization: use an EM-style alternating scheme to refine W and H in order to minimize the objective function.
# A common approach is to iterate between two multiplicative update rules until convergence.

# Steps: 

# 1-) Construct a vector space model for the documents (after stopword filtering), which results in a 
# term document matrix A.

# 2-) Apply TF-IDF term weight normalization to A.

# 3-) Normalize TF-IDF vectors to unit length.

# 4-) Initialize the factors using non-negative double singular value decomposition (NNDSVD) on A.

# 5-) Apply a projected gradient non-negative matrix factorization to A.

# * Basis Vectors: The topics (clusters) in the data.
# * Coefficient Matrix: The membership weights for documents relative to each topic (cluster).


#---------------------------------------------------------------------------------------------------#

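# * Create a document term matrix with TF-IDF vectorization.
# * Factorize it to obtain W and H.
# Basis vectors W = Topics (Clusters)
# Coefficients H = Memberships for documents
# Note: the roles above assume a term-document matrix (rows = terms). scikit-learn works on the
# transposed, document-term matrix, so there the topics (word weights per topic) are the rows of
# `components_` and the per-document topic memberships are what `transform()` returns.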


# Important Notes:
# Just like LDA, we need to choose the number of expected topics beforehand (the value of k)!
# Moreover, just like LDA, we have to interpret the topics ourselves, based on the coefficient values
# of the words in each topic.
# Unlike LDA, these coefficient values are not probabilities.


"""Comparison between Latent Dirichlet allocation (LDA) and Non-negative Matrix Factorization (NMF) –

Latent Dirichlet allocation (LDA)

* Assumes each document has multiple topics.
* Works best with longer texts such as full articles, essays, and books.
* Evolves as you process new documents with the same model.
* Results are not deterministic, meaning you might get different results each time for the same data set.

Non-negative Matrix Factorization(NMF)

* Calculates how well each document fits each topic, rather than assuming a document has multiple topics.
* Usually faster than LDA.
* Works best with shorter texts such as tweets or titles.
* Results are almost deterministic, having more consistency when running the same data."""

In [19]:
import pandas as pd

# Load the NPR articles CSV into a DataFrame (one text column, 'Article').
npr = pd.read_csv('npr.csv')

# Print the (pandas-truncated) frame, one blank line, then the object's type.
print(npr, type(npr), sep='\n\n')

                                                 Article
0      In the Washington of 2016, even when the polic...
1        Donald Trump has used Twitter  —   his prefe...
2        Donald Trump is unabashedly praising Russian...
3      Updated at 2:50 p. m. ET, Russian President Vl...
4      From photography, illustration and video, to d...
...                                                  ...
11987  The number of law enforcement officers shot an...
11988    Trump is busy these days with victory tours,...
11989  It’s always interesting for the Goats and Soda...
11990  The election of Donald Trump was a surprise to...
11991  Voters in the English city of Sunderland did s...

[11992 rows x 1 columns]

<class 'pandas.core.frame.DataFrame'>


In [20]:
# Preview the first five articles; as the cell's last expression it renders as a rich table.
npr.head(n=5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [21]:
# Preview the last five articles; as the cell's last expression it renders as a rich table.
npr.tail(n=5)

Unnamed: 0,Article
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...
11991,Voters in the English city of Sunderland did s...


In [22]:
# Plain-text view of both ends of the frame, separated by two blank lines.
print(npr.head(), npr.tail(), sep='\n\n\n')

                                             Article
0  In the Washington of 2016, even when the polic...
1    Donald Trump has used Twitter  —   his prefe...
2    Donald Trump is unabashedly praising Russian...
3  Updated at 2:50 p. m. ET, Russian President Vl...
4  From photography, illustration and video, to d...


                                                 Article
11987  The number of law enforcement officers shot an...
11988    Trump is busy these days with victory tours,...
11989  It’s always interesting for the Goats and Soda...
11990  The election of Donald Trump was a surprise to...
11991  Voters in the English city of Sunderland did s...


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# TF-IDF vectorizer configuration:
#   stop_words='english' - drop scikit-learn's built-in English stop words
#   min_df=2             - ignore terms that occur in fewer than 2 documents
#   max_df=0.90          - ignore terms that occur in more than 90% of the documents
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.90,
)

# Show the configured estimator and its class.
print(tfidf, type(tfidf), sep='\n')

TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [24]:
# Learn the vocabulary and IDF weights from the article text and build the
# TF-IDF document-term matrix (one row per article) in a single pass.
document_term_matrix = tfidf.fit_transform(
    npr['Article']
)

# Printing a sparse matrix lists (row, column) -> weight entries, truncated by SciPy.
print(document_term_matrix, type(document_term_matrix), sep='\n')

  (0, 35089)	0.03226897967994957
  (0, 54092)	0.020091155964863487
  (0, 38081)	0.01848530736819011
  (0, 54067)	0.012397163630307462
  (0, 10755)	0.02787343599042425
  (0, 2084)	0.015039386128760875
  (0, 42314)	0.022036401840046918
  (0, 37132)	0.019410859475366766
  (0, 36306)	0.03708208498289024
  (0, 11215)	0.04403771437574997
  (0, 21789)	0.02878450387905149
  (0, 1489)	0.023797160985101373
  (0, 50697)	0.019446980637479037
  (0, 33048)	0.024542771720625026
  (0, 51402)	0.016746597091717476
  (0, 19989)	0.02443930384385255
  (0, 36685)	0.030973769110011082
  (0, 49906)	0.02337784417528088
  (0, 19711)	0.019932202578472266
  (0, 10363)	0.03610629520801115
  (0, 35891)	0.015092411075280365
  (0, 35436)	0.024429052861450844
  (0, 50587)	0.019150868499058813
  (0, 13837)	0.053973383521678145
  (0, 1932)	0.027002284256125594
  :	:
  (11991, 24722)	0.01644172551896015
  (11991, 30961)	0.013423780587151574
  (11991, 39886)	0.014046569134225128
  (11991, 7572)	0.02747146498745652
  (1199

In [25]:
# Bare expression: the notebook echoes the sparse-matrix summary
# (shape, dtype, number of stored elements and storage format).
document_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [26]:
# Perform Non-Negative Matrix Factorization 
from sklearn.decomposition import NMF


# Ask for 10 non-negative components (topics). The fixed seed keeps the
# factorization repeatable from run to run.
nmf_model = NMF(
    random_state=42,
    n_components=10,
)

# fit() returns the estimator itself, so leaving the call as the last
# expression echoes the fitted model's repr below the cell.
nmf_model.fit(document_term_matrix)



NMF(n_components=10, random_state=42)

In [30]:
# The vectorizer's vocabulary has one entry per TF-IDF column, so its length
# (54777 for this corpus) is the number of distinct terms that were used to
# build the TF-IDF representation.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on installs that predate the new one. list() keeps `feature_names` a plain
# list, as it was with the old API.
try:
    feature_names = list(tfidf.get_feature_names_out())
except AttributeError:  # scikit-learn < 1.0
    feature_names = tfidf.get_feature_names()
print(len(feature_names))

print(f"There are {len(feature_names)} unique terms or words in the corpus used to create the TF-IDF representation.")

print()
print()
print()
print()
# Peek at the first 1000 vocabulary entries. Slicing (instead of indexing a
# fixed range) cannot raise IndexError when the vocabulary has fewer terms.
for term in feature_names[:1000]:
    print(term)

54777
There are 54777 unique terms or words in the corpus used to create the TF-IDF representation.




00
000
00000
000s
000th
002
004
007
009
00s
01
011
012
015
02
021
024
029
03
032
033
04
042
05
050
054
058
06
060
062
064
065
068
07
075
08
080
088
09
094
098
0_hellofriend
10
100
1000
100th
101
101st
102
103
104
1040
105
105th
106
1066
107
1070
108
109
10k
10s
10th
11
110
111
112
113
113th
114
114th
115
115th
116
117
118
119
11th
12
120
1200
121
122
123
1234
124
125
125th
126
127
128
129
12th
13
130
1300
1300s
131
131st
132
133
134
135
136
137
138
139
13th
14
140
1400s
141
142
143
143rd
144
145
146
147
148
149
1492
1493
14th
15
150
1500
1500s
150th
151
1517
152
1523
153
154
155
156
157
158
159
15girls
15th
16
160
1600
1600s
161
1616
162
1623
163
1631
164
165
166
1662
1669
167
168
169
16mm
16th
17
170
1700s
171
1713
172
173
1734
174
1747
175
176
1761
1762
1766
177
1770
1770s
1774
1776
178
1780s
1783
1784
1787
1788
1789
179
1790
1790s
1791
1793
1796
17th
18
180
1800
1800s
1801
1804
18

In [33]:
# Resolve the vocabulary once, before looping. Calling the getter inside the
# comprehension rebuilt the full term list for every single word looked up.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older installs.
try:
    vocabulary = tfidf.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    vocabulary = tfidf.get_feature_names()

# Each row of components_ holds one topic's weight for every vocabulary term.
for index, topic in enumerate(nmf_model.components_):
    print(f"The top 25 words for topic#{index}: ")
    print()
    # argsort() is ascending, so the last 25 positions are the 25 highest-weighted
    # terms (printed from lowest to highest weight). str() keeps plain Python
    # strings in the printed list regardless of the container's element type.
    top25_words = [str(vocabulary[j]) for j in topic.argsort()[-25:]]
    print(top25_words)
    print()

The top 25 words for topic#0: 

['lot', 'help', 'time', 'use', 'scientists', 'don', 'make', 'companies', 'work', 'research', 'year', 'university', 'workers', '000', 'years', 'just', 'company', 'study', 'new', 'percent', 'like', 'water', 'food', 'people', 'says']

The top 25 words for topic#1: 

['republicans', 'washington', 'media', 'party', 'office', 'nominee', 'business', 'speech', 'tax', 'news', 'administration', 'cruz', 'election', 'pence', 'gop', 'presidential', 'obama', 'house', 'white', 'republican', 'donald', 'campaign', 'said', 'president', 'trump']

The top 25 words for topic#2: 

['costs', 'services', 'premiums', 'plans', 'said', 'medical', 'insurers', 'house', 'aca', 'percent', 'patients', 'repeal', 'law', 'act', 'republicans', 'tax', 'people', 'plan', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']

The top 25 words for topic#3: 

['islamic', 'iraqi', 'civilians', 'reports', 'security', 'mosul', 'president', 'government', 'attack', 'turkey

In [41]:
# Attach the discovered topic labels to the original articles.
#
# Score every article against every topic. The input must be the same TF-IDF
# matrix the model was fitted on, which this notebook names
# `document_term_matrix` (no `dtm` variable is defined in the notebook, so
# using that name fails on a fresh kernel).
topic_results = nmf_model.transform(document_term_matrix)

# For each article (row), take the index of its highest-scoring topic.
# Computed once and reused for both the printout and the new column.
dominant_topic = topic_results.argmax(axis=1)
print(dominant_topic)

# Store that index as a numerical topic label in a new 'Topic' column of the npr data frame.
npr['Topic'] = dominant_topic


[1 1 1 ... 7 4 0]


In [42]:
# First five articles, now shown together with their assigned 'Topic' label.
npr.head(n=5)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [43]:
# Last five articles, now shown together with their assigned 'Topic' label.
npr.tail(n=5)

Unnamed: 0,Article,Topic
11987,The number of law enforcement officers shot an...,8
11988,"Trump is busy these days with victory tours,...",1
11989,It’s always interesting for the Goats and Soda...,7
11990,The election of Donald Trump was a surprise to...,4
11991,Voters in the English city of Sunderland did s...,0


In [44]:
# Plain-text view of the first five labelled articles.
print(npr.head(n=5))

                                             Article  Topic
0  In the Washington of 2016, even when the polic...      1
1    Donald Trump has used Twitter  —   his prefe...      1
2    Donald Trump is unabashedly praising Russian...      1
3  Updated at 2:50 p. m. ET, Russian President Vl...      3
4  From photography, illustration and video, to d...      6


In [45]:
# Plain-text view of the last five labelled articles.
print(npr.tail(n=5))

                                                 Article  Topic
11987  The number of law enforcement officers shot an...      8
11988    Trump is busy these days with victory tours,...      1
11989  It’s always interesting for the Goats and Soda...      7
11990  The election of Donald Trump was a surprise to...      4
11991  Voters in the English city of Sunderland did s...      0
