In [None]:
"""

Non-negative Matrix Factorization is an unsupervised algorithm that
simultaneously performs dimensionality reduction and clustering.

We can use it in conjunction with TF-IDF to model topics across
documents.


The general mathematics behind non-negative matrix factorization (NMF):

We are given a non-negative matrix A, which is what we create using
TF-IDF.
We then want to find a k-dimensional approximation of A in
terms of non-negative factors W and H.

So we have a matrix A of size n*m, whose rows are features and whose
columns are objects.

We want to factorize A, i.e. approximate it by the matrix product of
W and H, where W is n by k and H is k by m.

A(n*m: Data Matrix:- Rows=Features, Cols=Objects)-->
W(n*k: Basis Vectors :- Rows=Features) * H(k*m: Coefficient Matrix:Cols =objects)

Approximate each object (i.e. column of A) by a linear combination of k
reduced dimensions or "basis vectors" in W.

Each basis vector can be interpreted as a cluster. The memberships of
objects in these clusters are encoded by H.

Input:- Non-negative data matrix(A), number of basis vectors(k), initial
values of factors W and H (e.g random matrices)

Objective Function:- Some measure of reconstruction error between A
and the approximation WH.

Expectation-maximization optimisation is used to refine W and H in order to
minimise the objective function. A common approach is to iterate between
two multiplicative update rules until convergence, i.e. until the
product WH is a good approximation of A.

1.) Construct vector space model for documents(after stopword filtering),
resulting in a term document matrix A.

2.) Apply TF-IDF term weight normalisation to A.

3.) Normalize TF-IDF vectors to unit length.

4.) Initialise factors using NNDSVD on A.
NNDSVD:- Non-negative Double Singular Value Decomposition


5.) Apply Projected Gradient NMF to A.

We end up discovering:-

Basis vectors:- The topics(clusters) in the data.

Coefficient matrix:- The membership weights for documents relative to
each topic(cluster)

Just like LDA, we will need to select the number of expected topics
beforehand (the value of k)!

Also just like with LDA, we will have to interpret the topics based off
the coefficient values of the words per topic.


"""



In [1]:
import pandas as pd

# Load the NPR articles into a DataFrame.
# NOTE(review): absolute path — "/content/" looks like a Colab working
# directory; adjust it when running the notebook elsewhere.
npr = pd.read_csv("/content/npr.csv")

In [3]:
# Preview the first rows to check that the CSV loaded and which columns it has.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Configure the TF-IDF vectorizer used to turn article text into features:
#   stop_words="english" -> remove scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms found in fewer than 2 articles
#   max_df=0.95          -> ignore terms found in more than 95% of articles
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)
tfidf

In [12]:
# Learn the vocabulary and IDF weights from the article texts and build the
# TF-IDF weighted document-term matrix (one row per article).
dtm = tfidf.fit_transform(npr["Article"])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.decomposition import NMF

In [14]:
# Set up NMF with 7 components (topics); the fixed random_state keeps the
# factorization reproducible from run to run.
nmf_model = NMF(
    n_components=7,
    random_state=42,
)
nmf_model

In [15]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

In [None]:
# display topics


In [16]:
# Spot-check the vocabulary: look up the term stored at index 2300 of the
# fitted vectorizer's feature names.
tfidf.get_feature_names_out()[2300]

'albala'

In [20]:
# Print the highest-weighted vocabulary terms for every topic learned by NMF.
# The feature-name array is fetched once up front: calling
# get_feature_names_out() inside the comprehension rebuilt the whole
# vocabulary array for every single looked-up word.
feature_names = tfidf.get_feature_names_out()
n_top_words = 15

for index, topic in enumerate(nmf_model.components_):
    print(f"The top {n_top_words} words for the topic #{index}")
    # argsort() is ascending, so the last n_top_words positions hold the
    # indices of the largest weights (the strongest term comes last).
    top_term_ids = topic.argsort()[-n_top_words:]
    print([feature_names[i] for i in top_term_ids])
    print("\n")



The top 15 words for the topic #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


The top 15 words for the topic #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


The top 15 words for the topic #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The top 15 words for the topic #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


The top 15 words for the topic #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


The top 15 words for the topic #5
[

In [None]:
# attach topic labels to articles

In [21]:
# Express every article in terms of the fitted topics: the result has one row
# per article and one weight per topic.
topic_results = nmf_model.transform(dtm)
topic_results

array([[0.        , 0.12079653, 0.00139891, ..., 0.01519226, 0.        ,
        0.        ],
       [0.00597968, 0.12635872, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.14152107, 0.        , ..., 0.02266   , 0.        ,
        0.        ],
       ...,
       [0.03171102, 0.        , 0.00840819, ..., 0.00373081, 0.02436518,
        0.        ],
       [0.        , 0.03797827, 0.01069385, ..., 0.12672449, 0.01175785,
        0.00100258],
       [0.02163064, 0.00645752, 0.00070956, ..., 0.01240127, 0.01279309,
        0.00154379]])

In [26]:
# Index of the largest topic weight for the first article, i.e. its dominant topic.
topic_results[0].argmax()

1

In [27]:
# Tag each article with the index of its highest-weighted topic
# (argmax along axis 1 picks one topic per row).
npr["Topic"] = topic_results.argmax(axis=1)
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [29]:
# Human-readable label for each NMF topic index (0-6).
topic_dict = {
    0: 'Health',
    1: 'Campaign',
    2: 'legislation',
    3: 'Politics',
    4: 'election',
    5: 'music',
    6: 'education',
}

# Translate each article's numeric topic index into its readable label.
npr["Topic Label"] = npr["Topic"].map(topic_dict)
npr.head(15)

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,Campaign
1,Donald Trump has used Twitter — his prefe...,1,Campaign
2,Donald Trump is unabashedly praising Russian...,1,Campaign
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Politics
4,"From photography, illustration and video, to d...",6,education
5,I did not want to join yoga class. I hated tho...,5,music
6,With a who has publicly supported the debunk...,0,Health
7,"I was standing by the airport exit, debating w...",0,Health
8,"If movies were trying to be more realistic, pe...",0,Health
9,"Eighteen years ago, on New Year’s Eve, David F...",5,music


In [30]:
# Count how many articles were assigned to each topic label.
npr['Topic Label'].value_counts()

Unnamed: 0_level_0,count
Topic Label,Unnamed: 1_level_1
music,3579
Politics,2808
Health,2433
Campaign,1295
election,641
education,635
legislation,601
