In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

# nltk.word_tokenize (used by tokenize() further down) needs the Punkt sentence models.
# Newer NLTK releases load them from the 'punkt_tab' resource instead of the pickled
# 'punkt' package, so fetch both to keep the notebook runnable across NLTK versions.
# nltk.download() is a no-op when the package is already up to date.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the article dataset from the working directory (path is relative, so the
# notebook must be started next to the CSV). The preview below shows two columns:
# `text` (headline + body) and `labels` (category name).
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
# Keep 30% of the rows aside as a hold-out set; the fixed random_state makes the
# split reproducible. Despite the X_ prefix, both pieces are full DataFrames
# (they still carry the `text` and `labels` columns), not feature matrices.
X_train, X_hold = train_test_split(df, test_size = 0.3, random_state = 111)

In [9]:
# Snowball English stemmer; PorterStemmer (imported at the top) is the alternative.
stemmer = nltk.stem.SnowballStemmer('english')
# English stop-word list consumed by tokenize(); kept as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

def tokenize(text):
  """Split `text` into lower-cased, stemmed word tokens.

  A token produced by `nltk.word_tokenize` is kept only if

  * more than three characters remain after stripping leading/trailing
    'X', 'x' and '/' and deleting every run of digits, and
  * its lower-cased form is not in `stop_words`.

  The stripping/digit removal is used for the length test only; the token
  that is lower-cased and stemmed is the original, unstripped one.

  Parameters
  ----------
  text : str
      Raw document text.

  Returns
  -------
  list of str
      Stems of the surviving tokens, in document order.
  """
  def is_wordlike(word):
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    # This single test subsumes the former `len(word) > 3` and
    # `len(word.strip('Xx/')) > 2` checks, because stripping and deleting
    # characters can only make the word shorter.
    # NOTE(review): the 'Xx/' stripping also rejects short words that start or
    # end with x (e.g. 'xbox' -> 'bo'); confirm that this is intended.
    return len(re.sub(r'\d+', '', word.strip('Xx/'))) > 3

  lowered = (word.lower() for word in nltk.word_tokenize(text) if is_wordlike(word))
  # Stop words are matched on the lower-cased token, before stemming.
  return [stemmer.stem(word) for word in lowered if word not in stop_words]

# TF-IDF over unigrams and bigrams of the stems produced by tokenize().
# - tokenize() already lower-cases and removes stop words, hence lowercase=False
#   and stop_words=None here.
# - token_pattern=None: the default regex is ignored whenever a tokenizer is
#   supplied, and leaving it set makes scikit-learn emit a UserWarning on fit.
# - max_df=0.75 discards terms that occur in more than 75% of the documents;
#   max_features=1000 then keeps the 1000 most frequent remaining terms.
vectorizer = TfidfVectorizer(
  tokenizer = tokenize,
  token_pattern = None,
  stop_words = None,
  max_df = 0.75,
  max_features = 1000,
  lowercase = False,
  ngram_range = (1,2),
)
# Learn vocabulary and IDF weights on the training texts only.
tfidf_vectors = vectorizer.fit_transform(X_train.text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Sanity check: first ten entries of the learned vocabulary (these are stems, not full words).
vectorizer.get_feature_names_out()[:10]

array(['abl', 'accept', 'access', 'accord', 'account', 'accus', 'achiev',
       'across', 'act', 'action'], dtype=object)

In [13]:
# Factorise the TF-IDF matrix into 6 non-negative components ("topics");
# random_state is fixed so the factorisation is reproducible.
clf = decomposition.NMF(n_components = 6, random_state = 111)

# W1: document-topic weights, one row per training document and one column per topic.
W1 = clf.fit_transform(tfidf_vectors)
# H1: topic-term weights, one row per topic and one column per vocabulary entry.
H1 = clf.components_

In [18]:
# Expect (number of training documents, n_components).
W1.shape

(1557, 6)

In [19]:
# Expect (n_components, vocabulary size); the vocabulary is capped by max_features.
H1.shape

(6, 1000)

In [22]:
# Number of highest-weighted terms used to describe each topic.
num_words = 15

vocab = np.array(vectorizer.get_feature_names_out())

def top_words(t):
  """Return the `num_words` vocabulary terms with the largest weights in `t`.

  `t` is one row of `H1`, i.e. one weight per vocabulary entry. The terms are
  returned as a list ordered from the highest weight to the lowest.
  `num_words` and `vocab` are read from the notebook namespace at call time.
  """
  # argsort is ascending, so reverse it and keep the first `num_words` indices.
  strongest_first = np.argsort(t)[::-1][:num_words]
  return [vocab[i] for i in strongest_first]

# One list of terms per topic, then one space-joined summary string per topic.
topic_words = [top_words(t) for t in H1]
topics = [' '.join(words) for words in topic_words]

topics

['compani firm share deal yuko would execut profit busi group sale market court financi airlin',
 'film award best star nomin oscar actor actress director movi includ festiv year comedi music',
 'game play england player match wale injuri team ireland first side coach second franc time',
 'labour elect blair parti tori brown minist govern would howard prime prime minist plan chancellor campaign',
 'mobil phone technolog peopl music user use comput servic softwar game digit network microsoft devic',
 'economi growth rate price bank econom year rise market quarter dollar figur fall china sale']

In [25]:
# Document-topic table for the training set: one row per document, one column per
# topic, weights rounded to two decimals for display.
colnames = ['Topic' + str(i) for i in range(clf.n_components)]
# Rows are labelled by position in W1; the original DataFrame index is not carried over.
docnames = ['Doc' + str(i) for i in range(W1.shape[0])]
df_doc_topic = pd.DataFrame(np.round(W1, 2), columns = colnames, index = docnames)
# Pick the dominant topic from the UNROUNDED weights: after rounding, close weights
# can tie (or all collapse to 0.0), and argmax would then return the lowest-numbered
# topic rather than the one with the genuinely largest weight.
significant_topic = np.argmax(W1, axis = 1)
df_doc_topic['dominant_topic'] = significant_topic
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.01,0.00,0.00,0.0,0.00,0.13,5
Doc1,0.02,0.00,0.00,0.0,0.15,0.00,4
Doc2,0.03,0.01,0.00,0.0,0.00,0.10,5
Doc3,0.02,0.00,0.00,0.0,0.12,0.01,4
Doc4,0.04,0.00,0.01,0.0,0.00,0.12,5
...,...,...,...,...,...,...,...
Doc1552,0.00,0.00,0.00,0.0,0.00,0.30,5
Doc1553,0.01,0.08,0.04,0.0,0.03,0.02,1
Doc1554,0.10,0.00,0.02,0.0,0.00,0.00,0
Doc1555,0.00,0.20,0.01,0.0,0.00,0.03,1


In [27]:
# Project the first five hold-out documents onto the topics learned from the training
# set (transform only; neither the vectorizer nor the NMF model is refitted).
# .iloc makes the positional slice explicit, since the hold-out index is shuffled.
WHold = clf.transform(vectorizer.transform(X_hold.text.iloc[:5]))

colnames = ['Topic' + str(i) for i in range(clf.n_components)]
# Rows are labelled by position in WHold, not by the original DataFrame index.
docnames = ['Doc' + str(i) for i in range(WHold.shape[0])]
# NOTE: this rebinds df_doc_topic, replacing the training-set table built earlier.
df_doc_topic = pd.DataFrame(np.round(WHold, 2), columns = colnames, index = docnames)
# Pick the dominant topic from the UNROUNDED weights: after rounding, close weights
# can tie (or all collapse to 0.0), and argmax would then return the lowest-numbered
# topic rather than the one with the genuinely largest weight.
significant_topic = np.argmax(WHold, axis = 1)
df_doc_topic['dominant_topic'] = significant_topic
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.02,0.02,0.0,0.01,0.04,0.08,5
Doc1,0.0,0.05,0.03,0.0,0.09,0.01,4
Doc2,0.0,0.0,0.01,0.12,0.01,0.0,3
Doc3,0.01,0.01,0.01,0.07,0.01,0.11,5
Doc4,0.02,0.0,0.0,0.0,0.0,0.23,5
