In [2]:
import random

import numpy as np
import pandas as pd
from gensim import corpora, models
from nltk.cluster.kmeans import KMeansClusterer as nl_kmeans
from nltk.cluster.util import cosine_distance
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans as sk_kmeans
from sklearn.datasets import fetch_20newsgroups


In [3]:
# Restrict the 20 Newsgroups data to four categories; the integer targets
# of the fetched posts are used as the reference labels for the clustering scores.
categories = [
    'comp.graphics',
    'rec.autos',
    'sci.med',
    'talk.politics.mideast',
]
# Headers, footers and quoted replies are removed from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
Y = newsgroups.target

In [4]:
# Load the document collection: every line of the file becomes one document,
# represented as the list of its whitespace-separated tokens.
# NOTE(review): presumably the lines are in the same order as the posts behind
# `Y`, since the clustering is later scored against `Y` - confirm.
name = '20docs1Dif.txt'
# The context manager closes the handle even if reading raises part-way.
with open(name, 'r') as file:
    corpus = [line.strip().split() for line in file]

In [5]:
# `corpus` is already tokenised, so it is fed to gensim directly:
# build the token <-> id mapping, then encode every document as bag-of-words.
dictionary = corpora.Dictionary(corpus)
bow_corpus = list(map(dictionary.doc2bow, corpus))

In [12]:
# Sweep the number of topics (4..19) and record the c_v coherence of each fit
# as (num_topics, coherence) pairs.
#
# The candidate models are bound to their own name so that the sweep does not
# leave a 19-topic model behind in the global `lda_model`, and a fixed
# random_state makes the scores repeatable between runs.
rez = []
for num_topics in range(4, 20):
    candidate_model = models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=0,
    )
    coh = models.CoherenceModel(candidate_model, texts=corpus, coherence='c_v')
    rez.append((num_topics, coh.get_coherence()))
rez


[(4, 0.5489701329423775),
 (5, 0.5461610091086941),
 (6, 0.5937263159349457),
 (7, 0.5540937571223997),
 (8, 0.48244870246686217),
 (9, 0.5424293566345674),
 (10, 0.5873376761165991),
 (11, 0.5391533442406428),
 (12, 0.5153215253455183),
 (13, 0.5544305095214209),
 (14, 0.5455731982181954),
 (15, 0.5233916270482989),
 (16, 0.5065380640880058),
 (17, 0.5270722892847023),
 (18, 0.5319508353487308),
 (19, 0.5214311385356593)]

In [None]:
# Fit the LDA model whose document-topic distributions are clustered below.
# A fixed random_state makes the topics (and every score derived from them)
# repeatable under Restart & Run All.
# NOTE(review): unlike the coherence sweep this fit does not set `passes`,
# so it uses gensim's default - confirm that is intended.
lda_model = models.LdaModel(
    bow_corpus,
    num_topics=4,
    id2word=dictionary,
    random_state=0,
)
# Show the ten highest-weighted words of every topic.
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

In [35]:
# Build the (documents x topics) feature matrix used for clustering.
#
# The shape is derived from the corpus and the fitted model instead of being
# hard-coded, so the cell keeps working when the number of documents or
# topics changes. Probabilities are written by topic id into a pre-allocated
# matrix: this avoids the quadratic cost of np.append inside a loop and stays
# correct even if gensim omits a topic whose probability falls below its
# internal floor (the omitted entry simply stays 0).
num_documents = len(bow_corpus)
num_topics = lda_model.num_topics
X = np.zeros((num_documents, num_topics))
for doc_index, doc_bow in enumerate(bow_corpus):
    document_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for topic_id, topic_prob in document_topics:
        X[doc_index, topic_id] = topic_prob
# Flat view of the same values, kept under the original name.
document_topic_vectors = X.ravel()

In [36]:
# Number of repeated k-means runs (NB: this name shadows the builtin `iter`).
iter = 30
# One zero-initialised score matrix per metric: two rows (one per k-means
# variant) and one column per run.
a_rand, v_measure, mutual, fowlkes = (np.zeros((2, iter)) for _ in range(4))

In [37]:
# Repeat both k-means variants `iter` times and score every run against `Y`.
# Row 0 of each score matrix holds scikit-learn k-means (Euclidean), row 1
# holds NLTK k-means with cosine distance.
#
# Each run is seeded with its own index: the runs still differ from one
# another (so the min/mean/max reported later stay meaningful) but the whole
# experiment is repeatable.
for i in range(iter):
    eucl_pred = sk_kmeans(
        n_clusters=4, init='k-means++', n_init='auto', random_state=i
    ).fit_predict(X)

    nl_clusterer = nl_kmeans(
        4, distance=cosine_distance, avoid_empty_clusters=True, rng=random.Random(i)
    )
    cos_pred = np.array(nl_clusterer.cluster(X, assign_clusters=True))

    for row, pred in enumerate((eucl_pred, cos_pred)):
        # NOTE: `a_rand` stores the plain Rand index (rand_score), not the adjusted one.
        a_rand[row][i] = metrics.rand_score(Y, pred)
        v_measure[row][i] = metrics.v_measure_score(Y, pred)
        mutual[row][i] = metrics.adjusted_mutual_info_score(Y, pred)
        fowlkes[row][i] = metrics.fowlkes_mallows_score(Y, pred)

In [38]:
# Score matrices for hierarchical clustering, one per metric:
# two rows (one per distance) and four columns (one per linkage), all zeros.
a_rand_h, v_measure_h, mutual_h, fowlkes_h = (np.zeros((2, 4)) for _ in range(4))

In [39]:
# Agglomerative clustering for every linkage / distance combination, scored
# against `Y`. Row 0 = Euclidean, row 1 = cosine; columns follow `linkage`.
#
# The scores use the same metric functions as the k-means cell
# (rand_score, v_measure_score, adjusted_mutual_info_score,
# fowlkes_mallows_score), so that each column of the final table holds a
# single metric for both k-means and hierarchical rows.
linkage = ['ward', 'complete', 'average', 'single']
for row, distance in enumerate(('euclidean', 'cosine')):
    for col, link in enumerate(linkage):
        if link == 'ward' and distance != 'euclidean':
            # Ward linkage only supports Euclidean distance; the entry keeps
            # its zero placeholder.
            continue
        hierachical = AgglomerativeClustering(n_clusters=4, linkage=link, metric=distance).fit(X)
        pred = hierachical.labels_
        a_rand_h[row][col] = metrics.rand_score(Y, pred)
        v_measure_h[row][col] = metrics.v_measure_score(Y, pred)
        mutual_h[row][col] = metrics.adjusted_mutual_info_score(Y, pred)
        fowlkes_h[row][col] = metrics.fowlkes_mallows_score(Y, pred)

In [40]:
def _table_column(kmeans_scores, hier_scores):
    """Flatten one metric's scores into a single table column.

    Parameters
    ----------
    kmeans_scores : sequence of 1-D score sequences
        One sequence of per-run scores per k-means variant.
    hier_scores : sequence of 1-D score sequences
        One sequence of per-linkage scores per distance.

    Returns
    -------
    list
        For every k-means variant: a 0 placeholder (section header row)
        followed by min, mean and max; then for every hierarchical row: a 0
        placeholder followed by the per-linkage scores.
    """
    column = []
    for scores in kmeans_scores:
        column += [0, np.min(scores), np.mean(scores), np.max(scores)]
    for scores in hier_scores:
        column += [0, *scores]
    return column


# The score arrays are NOT overwritten here: the columns are built into the
# DataFrame directly, so re-running this cell always produces the same table.
table = pd.DataFrame(
    {
        'a_rand': _table_column(a_rand, a_rand_h),
        'v_measure': _table_column(v_measure, v_measure_h),
        'mutual': _table_column(mutual, mutual_h),
        'fowlkes': _table_column(fowlkes, fowlkes_h),
    },
    index=['k-means euclid', 'min', 'avrg', 'max', 'k-means cosine', 'min', 'avrg', 'max',
           'hierarchical euclid', 'ward', 'complete', 'average', 'single',
           'hierarchical cosine', 'ward', 'complete', 'average', 'single'],
)
table.to_excel('LDA20news.xlsx', float_format="%.2f")