In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the 20 Newsgroups posts with headers, footers and quoted replies
# stripped, so only each post's own body text remains.
newsgroups = fetch_20newsgroups(remove=("headers", "footers", "quotes"))
text_data = newsgroups.data

# Peek at the first three raw documents.
text_data[:3]

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't an

In [6]:
# TF-IDF features for the corpus: English stop words are dropped, terms that
# appear in fewer than 10 documents are ignored, and the vocabulary is capped
# at 1500 terms.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=10,
    max_features=1500,
)

# Document-term matrix: one row per document in text_data.
X = vectorizer.fit_transform(text_data)

# Vocabulary as an array so it can be indexed with argsort results later on.
words = np.array(vectorizer.get_feature_names_out())

# For a quick sanity check while exploring, inspect X[:10] and words[:10].

In [7]:
# Factorise the TF-IDF matrix into 10 non-negative components ("topics") using
# the multiplicative-update solver.
# The seed is pinned so that any randomness in the initialisation is
# repeatable and the topics printed below are the same on every
# Restart & Run All.
nmf = NMF(n_components=10, solver="mu", random_state=42)
W = nmf.fit_transform(X)  # per-document weights over the 10 components
H = nmf.components_       # per-component weights over the vocabulary

# For each component, print its 10 highest-weighted terms. argsort is
# ascending, so the strongest term appears last on each line.
for i, topic in enumerate(H):
    top_terms = words[topic.argsort()[-10:]]
    print("Topic {}: {}".format(i + 1, ",".join(str(term) for term in top_terms)))

Topic 1: want,really,time,ve,good,know,think,like,just,don
Topic 2: help,anybody,info,looking,hi,mail,advance,know,does,thanks
Topic 3: does,church,christians,christian,faith,christ,believe,bible,jesus,god
Topic 4: league,win,hockey,play,players,season,games,year,team,game
Topic 5: bus,floppy,ide,controller,hard,drives,disk,card,scsi,drive
Topic 6: shipping,condition,car,offer,price,space,10,sale,00,new
Topic 7: running,problem,using,program,use,window,files,dos,file,windows
Topic 8: public,algorithm,escrow,government,use,keys,clipper,encryption,chip,key
Topic 9: rights,said,armenians,state,armenian,jews,israeli,government,israel,people
Topic 10: send,internet,email,ftp,article,university,com,cs,soon,edu


In [11]:
# print(W[:10,:10])
# print(H[:10,:10])

In [14]:
# Topic-term weights as a labelled table: one row per NMF component and one
# column per vocabulary term, so a term's weight in each topic can be looked
# up by name (e.g. components_df["year"]).
# pandas is imported in the notebook's top import cell.
components_df = pd.DataFrame(nmf.components_, columns=words)
components_df

Unnamed: 0,00,000,01,02,03,04,05,06,0d,0t,...,x11r5,xt,xterm,yeah,year,years,yes,york,young,zip
0,2.134587e-26,4.188077e-05,6.775115e-10,0.003067528,2.467458e-11,9.405396e-12,4.085501e-14,2.065617e-09,5.083868e-99,2.263419e-74,...,1.568053e-09,1.031935e-08,8.731853e-10,0.1415235,0.07469806,0.2334221,0.217491,0.0006450032,0.04859644,1.06348e-11
1,4.292733e-19,3.040342e-15,9.551773e-16,1.813921e-14,3.725023e-05,4.376513e-08,2.17675e-06,3.054217e-15,5.745337e-50,8.279404e-63,...,0.04661155,0.009784276,0.0149669,1.252511e-15,6.7485e-13,1.100174e-13,0.004592,9.454957000000001e-17,1.337535e-13,5.307906e-08
2,9.588315e-41,8.771624999999999e-30,1.008068e-27,0.003389731,6.532696e-15,7.613744e-18,3.309564e-19,1.861066e-18,1.227174e-134,3.738976e-136,...,1.3530789999999999e-20,9.445087e-17,1.621236e-15,2.012219e-13,1.906686e-14,0.0003928215,0.045904,2.033563e-16,0.008291426,9.33181e-21
3,1.986359e-16,0.04577216,0.0005434981,0.004779104,0.002370635,1.345074e-07,0.001590749,4.626523e-05,1.39844e-11,7.609624e-11,...,4.934244e-22,3.134273e-17,1.0130710000000001e-29,0.06385568,0.943715,0.1938534,0.034423,0.07698655,0.08551431,2.439009e-21
4,5.249474e-19,0.0001685834,2.97109e-16,0.008733433,1.894837e-10,0.0005163327,0.003314329,1.395884e-22,5.581464e-12,3.421068e-15,...,2.3358939999999998e-21,0.06266928,1.29791e-29,9.366707e-08,0.007160408,2.296145e-10,0.023093,1.294163e-14,3.068449e-23,1.11952e-10
5,0.9325127,0.2667136,0.07199053,0.04653977,0.03181757,0.06324918,0.03955387,0.03683774,0.004069863,0.00222935,...,1.649888e-07,0.01281476,6.04737e-10,3.153844e-15,0.15737,0.2149765,0.006984,0.1134869,0.01393578,5.474634e-16
6,1.493884e-13,5.59917e-18,0.01703801,0.003132047,0.01548425,5.773113e-05,0.006188977,0.00498626,8.285692e-18,3.350643e-22,...,0.1020315,0.05304726,0.111684,0.000101306,1.68477e-07,2.811234e-13,0.001753,2.055867e-14,3.185568e-13,0.1510158
7,6.962656e-33,3.23327e-18,5.724469e-10,0.005437945,2.83405e-20,0.003726976,1.754928e-17,5.055192e-15,1.403088e-86,3.354273e-64,...,2.16387e-09,0.005480651,0.02348755,1.309414e-09,2.460304e-08,0.001656487,0.042007,3.387719e-14,7.508872e-15,8.798462e-29
8,4.631919e-32,0.1517924,2.612769e-15,0.01249387,5.516315e-18,8.020273e-08,1.9485390000000001e-22,8.890479e-10,5.3031290000000004e-68,2.429026e-61,...,3.711429e-07,2.238732e-24,1.713938e-28,0.001781317,0.07141517,0.1699989,0.078591,0.04506567,0.05479044,1.496713e-14
9,4.965321e-27,6.526066e-17,0.01450665,0.01585501,0.02193709,0.01961167,0.009876432,0.006335053,6.297386e-44,1.173985e-53,...,0.0008860999,0.008540512,0.0004065899,0.002950451,6.759583e-06,9.45063e-07,0.015906,8.950179e-11,0.005172997,0.1167912
