In [6]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora, models
from sklearn.cluster import KMeans as sk_kmeans
from nltk.cluster.kmeans import KMeansClusterer as nl_kmeans
from sklearn.cluster import AgglomerativeClustering
from nltk.cluster.util import cosine_distance
import pandas as pd
from sklearn import metrics

In [7]:
# Load the corpus: one document per line, tokens separated by whitespace.
# The context manager guarantees the handle is closed even if reading fails.
# NOTE(review): the encoding is left at the platform default, as before —
# confirm the file's encoding and pass it explicitly if it is known.
with open('20docs3Dif.txt', 'r') as file:
    # str.split() without arguments already ignores leading/trailing
    # whitespace, so a separate strip() pass is not needed.
    corpus = [line.split() for line in file]

In [8]:
# Ground-truth labels: the four newsgroups used as reference classes.
categories = ['comp.graphics', 'rec.autos', 'sci.med', 'talk.politics.mideast']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
Y = newsgroups.target

# NOTE(review): `corpus` comes from a local text file while `Y` comes from the
# fetched dataset; this assumes both list the same documents in the same
# order — TODO confirm.
# Map every token to an integer id, then encode each document as bag-of-words.
dictionary = corpora.Dictionary(corpus)
bow_corpus = list(map(dictionary.doc2bow, corpus))

In [5]:
# Sweep the number of LSI topics from 4 to 19 and record the c_v coherence of
# each model as (num_topics, coherence) pairs.
rez = []
for i in range(4, 20):
    lsi_model = models.LsiModel(corpus=bow_corpus, num_topics=i, id2word=dictionary)
    coh = models.CoherenceModel(model=lsi_model, texts=corpus, coherence='c_v')
    coherence_score = coh.get_coherence()
    rez += [(i, coherence_score)]
rez

[(4, 0.6755414728581919),
 (5, 0.5897825435029906),
 (6, 0.4931694608220445),
 (7, 0.5510627611209156),
 (8, 0.5226453624951066),
 (9, 0.6112757576026088),
 (10, 0.5578878588790583),
 (11, 0.5776261710106447),
 (12, 0.5450498878960249),
 (13, 0.5303916721748796),
 (14, 0.5148667454797353),
 (15, 0.5085542715760716),
 (16, 0.44833471536493885),
 (17, 0.500654128797894),
 (18, 0.48193362078291835),
 (19, 0.4420444225127686)]

In [9]:
# Train the LSI model used for the rest of the notebook (20 topics) and show
# the ten highest-weighted words of every topic.
lsi_model = models.LsiModel(corpus=bow_corpus, num_topics=20, id2word=dictionary)
lsi_topics = lsi_model.print_topics(num_words=10, num_topics=20)
for topic in lsi_topics:
    print(topic)

(0, '0.466*"image" + 0.441*"jpeg" + 0.305*"file" + 0.198*"format" + 0.181*"edu" + 0.178*"gif" + 0.176*"color" + 0.140*"graphic" + 0.133*"program" + 0.126*"version"')
(1, '0.468*"jpeg" + -0.361*"edu" + -0.244*"graphic" + -0.186*"pub" + 0.171*"gif" + -0.168*"data" + -0.140*"com" + -0.135*"mail" + -0.132*"c" + 0.131*"color"')
(2, '-0.319*"people" + -0.177*"armenian" + -0.160*"time" + -0.156*"child" + -0.154*"azerbaijani" + -0.142*"woman" + -0.141*"year" + -0.141*"apartment" + 0.120*"edu" + -0.114*"told"')
(3, '0.256*"image" + 0.184*"hiv" + -0.180*"edu" + 0.173*"data" + 0.173*"health" + -0.162*"graphic" + -0.156*"people" + 0.144*"cancer" + -0.128*"jpeg" + 0.125*"page"')
(4, '-0.393*"image" + 0.199*"jpeg" + 0.195*"hiv" + -0.184*"data" + 0.177*"health" + 0.155*"edu" + 0.149*"cancer" + -0.138*"tool" + 0.127*"patient" + 0.126*"disease"')
(5, '-0.383*"jew" + -0.372*"turkish" + -0.274*"adl" + -0.199*"turkey" + -0.160*"jewish" + -0.149*"bullock" + -0.137*"nazi" + -0.136*"ottoman" + 0.124*"hiv" + 

In [10]:
# Project every bag-of-words document into the LSI topic space.
# Each entry is whatever `lsi_model[doc_bow]` returns for that document; the
# dense feature matrix is assembled from these vectors in a later cell.
document_topic_vectors = [lsi_model[doc_bow] for doc_bow in bow_corpus]

In [13]:
# Number of LSI topics; must match the `num_topics` the LSI model was trained with.
NUM_TOPICS = 20
# Value used for topic components that are missing from a document's vector.
# NOTE(review): presumably non-zero so that no row is an all-zero vector, which
# would break cosine distance — confirm.
FILL_VALUE = 1e-10

# Build the dense (n_documents, NUM_TOPICS) feature matrix in one allocation
# instead of growing a flat array with np.append and reshaping to a hard-coded
# row count.
X = np.full((len(document_topic_vectors), NUM_TOPICS), FILL_VALUE)
for row, vec in enumerate(document_topic_vectors):
    # Each entry is a (topic_id, weight) pair. A vector may hold fewer than
    # NUM_TOPICS pairs; only the missing components keep FILL_VALUE, so the
    # weights that are present are no longer thrown away.
    for topic_id, weight in vec:
        X[row, topic_id] = weight

In [79]:
# Number of repeated k-means runs.
# NOTE(review): `iter` shadows the Python builtin; the name is kept because the
# clustering loop reads it.
iter = 30

# One independent (2, iter) score buffer per metric: a row per k-means variant,
# a column per run.
a_rand, v_measure, mutual, fowlkes = (np.zeros((2, iter)) for _ in range(4))

In [80]:
# Run both k-means variants `iter` times and score every run against the
# reference labels Y. Row 0 of each buffer holds the scikit-learn k-means
# results, row 1 the NLTK k-means results with cosine distance.
for i in range(iter):
    eucl_pred = sk_kmeans(n_clusters=4, init='k-means++', n_init='auto').fit_predict(X)

    nl_clusterer = nl_kmeans(4, distance=cosine_distance, avoid_empty_clusters=True)
    cos_pred = np.array(nl_clusterer.cluster(X, assign_clusters=True))

    # Same four metrics for both variants, written to column i of the matching row.
    for row, pred in enumerate((eucl_pred, cos_pred)):
        a_rand[row, i] = metrics.rand_score(Y, pred)
        v_measure[row, i] = metrics.v_measure_score(Y, pred)
        mutual[row, i] = metrics.adjusted_mutual_info_score(Y, pred)
        fowlkes[row, i] = metrics.fowlkes_mallows_score(Y, pred)

In [None]:
# One independent (2, 4) score buffer per metric for hierarchical clustering:
# two rows for the two distance settings, four columns for the four linkages.
a_rand_h, v_measure_h, mutual_h, fowlkes_h = (np.zeros((2, 4)) for _ in range(4))

In [None]:
# Score agglomerative clustering against the reference labels Y.
#
# Each buffer is paired with the metric its name (and the matching k-means
# buffer / summary-table column) stands for. Previously v_measure_h, mutual_h
# and fowlkes_h were filled with adjusted Rand, homogeneity and completeness,
# so the summary table mixed different metrics inside one column.
hier_metrics = (
    (a_rand_h, metrics.rand_score),
    (v_measure_h, metrics.v_measure_score),
    (mutual_h, metrics.adjusted_mutual_info_score),
    (fowlkes_h, metrics.fowlkes_mallows_score),
)

# Row 0: default (Euclidean) distance, columns ordered ward, complete, average, single.
linkage = ['ward', 'complete', 'average', 'single']
for i, link in enumerate(linkage):
    hierachical = AgglomerativeClustering(n_clusters=4, linkage=link).fit(X)
    hier_pred = hierachical.labels_
    for scores, score_fn in hier_metrics:
        scores[0][i] = score_fn(Y, hier_pred)

# Row 1: cosine distance. Ward linkage is omitted here (scikit-learn only
# supports it with Euclidean distance), so column 0 of row 1 keeps whatever
# value the buffer was initialised with; the remaining linkages are shifted by
# one to stay aligned with row 0.
linkage = ['complete', 'average', 'single']
for i, link in enumerate(linkage, start=1):
    hierachical = AgglomerativeClustering(n_clusters=4, linkage=link, metric='cosine').fit(X)
    hier_pred = hierachical.labels_
    for scores, score_fn in hier_metrics:
        scores[1][i] = score_fn(Y, hier_pred)

In [None]:
def _summary_column(kmeans_scores, hier_scores):
    """Flatten one metric's scores into a single summary-table column.

    Parameters
    ----------
    kmeans_scores : array-like, one row per k-means variant
        Per-run scores; every row is summarised as min / mean / max.
    hier_scores : array-like, one row per distance setting
        One score per linkage; every row is copied unchanged.

    Returns
    -------
    list
        ``[0, min, mean, max]`` for each k-means row followed by
        ``[0, *row]`` for each hierarchical row. The leading 0 of each group
        is a placeholder that lines up with the group's heading label in the
        table index.
    """
    column = []
    for runs in kmeans_scores:
        column += [0, np.min(runs), np.mean(runs), np.max(runs)]
    for by_linkage in hier_scores:
        column += [0, *by_linkage]
    return column


# Row labels: a heading row per experiment group followed by its entries.
row_labels = [
    'k-means euclid', 'min', 'avrg', 'max',
    'k-means cosine', 'min', 'avrg', 'max',
    'hierarchical euclid', 'ward', 'complete', 'average', 'single',
    'hierarchical cosine', 'ward', 'complete', 'average', 'single',
]

# The columns are built without rebinding a_rand / v_measure / mutual / fowlkes,
# so the raw score arrays stay intact and this cell can be re-run safely
# (rebinding them to lists made a second run compute statistics on the
# already-summarised values).
table = pd.DataFrame(
    {
        'a_rand': _summary_column(a_rand, a_rand_h),
        'v_measure': _summary_column(v_measure, v_measure_h),
        'mutual': _summary_column(mutual, mutual_h),
        'fowlkes': _summary_column(fowlkes, fowlkes_h),
    },
    index=row_labels,
)
table.to_excel('LSI20news.xlsx', float_format="%.2f")