In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r'D:/Data/NLP/TextFiles/npr.csv')

In [3]:
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


----

# Preprocessing

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
tfidf = TfidfVectorizer(max_df=0.95,min_df=2,stop_words='english')

In [8]:
dtm = tfidf.fit_transform(df['Article'])

In [9]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

----

# Non Negative Matrix Factorization

In [10]:
from sklearn.decomposition import NMF

In [11]:
nmf = NMF(n_components=7,random_state=42)

In [12]:
nmf.fit(dtm)



NMF(n_components=7, random_state=42)

----

# Displaying Topics

In [14]:
type(tfidf.get_feature_names())

list

In [15]:
len(tfidf.get_feature_names_out())

54777

In [19]:
tfidf.get_feature_names_out()[2]

'00000'

In [16]:
import random
for i in range(10):
    random1 = random.randint(0,54777)
    print(tfidf.get_feature_names_out()[random1])

alt_uscis
wrath
beaubien
dan
arlie
premiums
roshani
wonderful
lighted
liveliest


In [24]:
nmf.components_

array([[0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
        1.70313822e-03, 2.37544362e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.22048918e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 3.12379960e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.89723338e-03, 0.00000000e+00, 1.50186440e-03, ...,
        7.06428924e-04, 5.85500542e-04, 6.89536542e-04],
       [4.01763234e-03, 5.31643833e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [23]:
len(nmf.components_)

7

In [25]:
len(nmf.components_[0])

54777

In [27]:
for index,topic in enumerate(nmf.components_):
    print(f'Top 20 words in Topic:{index}')
    word = []
    for i in topic.argsort()[-20:]:
        word.append(tfidf.get_feature_names_out()[i])
    print(word)
    print('\n')

Top 20 words in Topic:0
['years', 'brain', 'university', 'researchers', 'scientists', 'new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


Top 20 words in Topic:1
['intelligence', 'office', 'nominee', 'republicans', 'comey', 'gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


Top 20 words in Topic:2
['insurers', 'federal', 'said', 'aca', 'repeal', 'senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


Top 20 words in Topic:3
['killed', 'reported', 'military', 'justice', 'city', 'officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


Top 20 words in Topic:4
['candidate', 'said', 'win',

In [40]:
topic_dict = {0:'Medical Research',1:'Politics',2:'Government Schemes',3:'National Security',4:'Elections',5:'Music and love',6:'Education'}

----

# Attaching discoverd labels to original Articles

In [29]:
topic_results = nmf.transform(dtm)

In [30]:
topic_results.shape

(11992, 7)

In [31]:
type(topic_results)

numpy.ndarray

In [32]:
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [34]:
topic_results[0].argmax()

1

In [35]:
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [36]:
df['Topic'] = topic_results.argmax(axis=1)

In [37]:
df.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [41]:
df['label'] = df['Topic'].map(topic_dict)

In [43]:
df.head(50)

Unnamed: 0,Article,Topic,label
0,"In the Washington of 2016, even when the polic...",1,Politics
1,Donald Trump has used Twitter — his prefe...,1,Politics
2,Donald Trump is unabashedly praising Russian...,1,Politics
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,National Security
4,"From photography, illustration and video, to d...",6,Education
5,I did not want to join yoga class. I hated tho...,5,Music and love
6,With a who has publicly supported the debunk...,0,Medical Research
7,"I was standing by the airport exit, debating w...",0,Medical Research
8,"If movies were trying to be more realistic, pe...",0,Medical Research
9,"Eighteen years ago, on New Year’s Eve, David F...",5,Music and love
