In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
# Load the BBC articles from a CSV in the working directory and preview the
# first rows. The preview shows an 'Unnamed: 0' index column plus the 'text'
# and 'labels' columns.
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# Build the TF-IDF document-term matrix from the article text.
#   max_df=0.95 -> drop terms that appear in more than 95% of the documents
#   min_df=2    -> drop terms that appear in fewer than 2 documents
#   stop_words  -> remove scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)
# Learn the vocabulary and vectorise every article in one pass.
dtm = tfidf.fit_transform(df['text'])

In [7]:
# Factorise the TF-IDF matrix into 7 topics with non-negative matrix
# factorisation. random_state is pinned so repeated runs use the same seed.
# NOTE(review): the preview of 'labels' suggests a labelled dataset; confirm
# that 7 components (rather than the number of label classes) is intentional.
nmf_model = NMF(n_components = 7, random_state = 42)
nmf_model.fit(dtm)

In [8]:
# Spot-check a single vocabulary entry (position 10) to see what the learned
# feature names look like.
tfidf.get_feature_names_out()[10]

'028'

In [8]:
# Print the highest-weighted vocabulary terms for every NMF topic.
#
# The vocabulary array is fetched once, before the loop. The original called
# tfidf.get_feature_names_out() for every word of every topic, rebuilding the
# same array each time.
n_top_words = 15
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC # {index}')
    # argsort() is ascending, so the last n indices are the heaviest terms;
    # the reversed slice lists them from strongest to weakest.
    top_indices = topic.argsort()[-n_top_words:][::-1]
    word_list = [feature_names[i] for i in top_indices]
    print(word_list)
    print('\n')

THE TOP 15 WORDS FOR TOPIC # 0
['mobile', 'people', 'music', 'said', 'digital', 'technology', 'phone', 'users', 'broadband', 'software', 'net', 'phones', 'microsoft', 'tv', 'video']


THE TOP 15 WORDS FOR TOPIC # 1
['mr', 'labour', 'blair', 'election', 'brown', 'party', 'said', 'government', 'howard', 'minister', 'chancellor', 'prime', 'tax', 'tory', 'tories']


THE TOP 15 WORDS FOR TOPIC # 2
['england', 'game', 'win', 'wales', 'ireland', 'cup', 'said', 'team', 'play', 'players', 'rugby', 'match', 'france', 'injury', 'coach']


THE TOP 15 WORDS FOR TOPIC # 3
['film', 'best', 'awards', 'award', 'actor', 'oscar', 'actress', 'festival', 'won', 'films', 'director', 'aviator', 'comedy', 'star', 'year']


THE TOP 15 WORDS FOR TOPIC # 4
['growth', 'economy', 'said', 'bank', 'year', 'sales', 'economic', 'oil', 'prices', '2004', 'market', 'china', 'rate', 'dollar', 'rates']




In [10]:
# Project the documents onto the fitted topics. The result is indexed per
# document along axis 0 and per topic along axis 1 (the next cell takes the
# argmax over axis 1 and stores it as one value per DataFrame row).
topic_results = nmf_model.transform(dtm)

In [14]:
# For each document pick the topic index with the largest weight and store it
# next to the original label for comparison.
# NOTE: this adds a column to `df` in place, so the earlier df.head() preview
# no longer reflects the frame's current columns.
predicted_topics = topic_results.argmax(axis = 1)
df['Predicted_Topic'] = predicted_topics
df.head(3)

Unnamed: 0,text,labels,Predicted_Topic
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,4
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,4
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,5
