In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.cluster import KMeans
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode(connected=True)
from sklearn.metrics.cluster import homogeneity_score
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
np.random.seed(0)

In [2]:
from sklearn.datasets import fetch_20newsgroups

#The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics


In [3]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42,categories=categories)

In [4]:
data_class = data.target
data_text  = data.data

In [5]:
count_vect = CountVectorizer(stop_words="english")
text_vectors = count_vect.fit_transform(data_text)
X_transform = TSNE(n_components=3).fit_transform(text_vectors.toarray())

In [6]:
text_vectors.shape

(2034, 33814)

# Original data

In [7]:
d = {'x':X_transform[:,0],'y':X_transform[:,1], 'z': X_transform[:,2],'Score':data_class}
df = pd.DataFrame(d)

In [8]:
fig = px.scatter_3d(df, x="x", y="y", z="z",color="Score")
fig.show()

# KMeans

In [9]:
est = KMeans(n_clusters=4)
est.fit(text_vectors)
labels = est.labels_

In [10]:
d = {'x':X_transform[:,0],'y':X_transform[:,1], 'z': X_transform[:,2],'Score':labels}
df = pd.DataFrame(d)

In [11]:
homogeneity_score(data_class,labels)

0.005854734538601563

In [12]:
fig = px.scatter_3d(df, x="x", y="y", z="z",color="Score")
fig.show()

# Non-negative matrix factorization 

In [13]:
text_vectors_T = text_vectors.T

In [14]:
text_vectors_T.shape

(33814, 2034)

In [15]:
model = NMF(n_components=4, init='random', random_state=0,beta_loss="kullback-leibler",solver="mu")
W = model.fit_transform(text_vectors_T)
H = model.components_

In [16]:
W.shape

(33814, 4)

In [17]:
H.shape

(4, 2034)

In [18]:
l,c = H.shape
l,c

(4, 2034)

In [22]:
labels_NMF = []
for index in range(c):

    col = H[:,index]
    
    cluster = np.argmax(col)
    
    labels_NMF.append(cluster)
    

In [23]:
d = {'x':X_transform[:,0],'y':X_transform[:,1], 'z': X_transform[:,2],'Score':labels_NMF}
df = pd.DataFrame(d)

In [24]:
homogeneity_score(data_class,labels_NMF)

0.47631825798603405

In [25]:
fig = px.scatter_3d(df, x="x", y="y", z="z",color="Score")
fig.show()

In [27]:
from sklearn import preprocessing
X_norm = preprocessing.normalize(text_vectors_T, norm='l2',axis=1)

In [53]:
def NMF(X, p, e, repetitions):
    n = X.shape[0]
    m = X.shape[1]
    U = np.random.rand(n, p)
    V = np.random.rand(p, m)
    err = 1000
    alpha = 0.02
    delta = 0.00001
    c = 0
    while(err > e and c<repetitions):
        # Update U
        XV_T = X @ V.T
        UVV_T =  (U @ V @ V.T) + delta

        #W = W * AH_T / WHH_T
        for i in range(np.size(U, 0)):
            for j in range(np.size(U, 1)):
                U[i, j] = U[i, j] * XV_T[i, j] / UVV_T[i, j]
               
        # Update V
        X_TU = X.T @ U
        V_TU_TU = (V.T @ U.T @ U) + delta
        #H = H * W_TA / W_TWH
       
        for i in range(np.size(V, 1)):
            for j in range(np.size(V, 0)):
                try:
                    V.T[i, j] = V.T[i, j] * X_TU[i, j] / V_TU_TU[i, j]
                except:
                    print(V.shape, X_TU.shape, V_TU_TU.shape, i, j)
       
       
        #err = mean_square_error(X, U@V)
        if c % 10 == 0:
            print('Iter ',c)
        c += 1
   
#         for j in range(U.shape[1]):    
#             U[:,j] = U[:,j]/np.sqrt(np.sum(U[:,j]**2))#preprocessing.normalize(U, axis=1)  
#         for j in range(V.T.shape[1]):    
#             V.T[:,j] = V.T[:,j]/np.sqrt(np.sum(U[:,j]**2))
   
    return U, V

In [None]:
W_, H_ = NMF(text_vectors_T, p=4, e=00000000.1, repetitions=2000)

Iter  0
Iter  10
Iter  20
Iter  30
Iter  40
Iter  50
Iter  60
Iter  70
Iter  80
Iter  90
Iter  100
Iter  110
Iter  120
Iter  130
Iter  140
Iter  150
Iter  160
Iter  170
Iter  180
Iter  190
Iter  200
Iter  210
Iter  220
Iter  230
Iter  240
Iter  250
Iter  260
Iter  270
Iter  280
Iter  290
Iter  300
Iter  310
Iter  320
Iter  330
Iter  340
Iter  350
Iter  360
Iter  370
Iter  380
Iter  390
Iter  400
Iter  410
Iter  420
Iter  430
Iter  440
Iter  450
Iter  460
Iter  470
Iter  480
Iter  490
Iter  500
Iter  510
Iter  520
Iter  530
Iter  540
Iter  550
Iter  560
Iter  570
Iter  580
Iter  590
Iter  600
Iter  610
Iter  620
Iter  630
Iter  640
Iter  650
Iter  660
Iter  670
Iter  680
Iter  690
Iter  700
Iter  710
Iter  720
Iter  730
Iter  740


In [None]:
labels_NMF_ = []
for index in range(H_.shape[1]):

    col = H_[:,index]
    
    cluster = np.argmax(col)
    
    labels_NMF_.append(cluster)


In [None]:
homogeneity_score(data_class,labels_NMF_)