In [112]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
# from sklearn.metrics import f1_score
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

import warnings
warnings.filterwarnings("ignore")

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  # 'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [114]:
def get_res(dataset):

    results = [KMeans(n_clusters=4, random_state=42).fit(dataset),
               DBSCAN(eps=0.3, min_samples=5).fit(dataset),
               AgglomerativeClustering(n_clusters=4).fit(dataset)]

    names = ["K-means", "DBSCAN", "AgglomerativeClustering"]

    for i in range(len(results)):
        res = results[i]
        name = names[i]

        acc = metrics.homogeneity_completeness_v_measure(newsgroups_train.target, res.labels_)
        print_acc(name, "HCV-measure", acc, next_res=False, end=False)

        acc = metrics.silhouette_score(dataset, res.labels_, metric='euclidean')
        print_acc(name, "Silhouette-score", acc, next_res=True, end=False)

        acc = metrics.fowlkes_mallows_score(newsgroups_train.target, res.labels_)
        print_acc(name, "Fowlkes-Mallows-score", acc, next_res=True, end=True)

        if i < len(results) - 1:
            print("-" * 45)

In [64]:
def print_acc(name_method, name_metric, acc, next_res=False, end=True):

    if not next_res:
        print(name_method)
    else:
        print()
    len_space_method = len(name_method) // 2
    size = 40

    if end:
        sym = "└"
    else:
        sym = "├"

    if name_metric == "HCV-measure":
        len_space_metric = len_space_method + len("HCV-measure") // 2 + 5
        print(" " * len_space_method + sym + "─── HCV-measure")

        text = " " * len_space_metric + "├─── H-score:"
        num_space = size - len(text)

        print(" " * len_space_metric + "├─── H-score:" + " " * num_space + f"{round(acc[0], 3)}")
        print(" " * len_space_metric + "├─── C-score:" + " " * num_space + f"{round(acc[1], 3)}")
        print(" " * len_space_metric + "└─── V-score:" + " " * num_space + f"{round(acc[2], 3)}")
    else:
        text = " " * len_space_method + sym + f"─── {name_metric}:"
        text += " " * (size - len(text))
        print(f"{text}{round(acc, 3)}")

In [2]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = lemmatizer.lemmatize(word.lower())
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None\
            and not any(sym in word for sym in special_sym):
            res += [word]
    return res

In [3]:
def get_lsi(corpus, dictn):
    lsi_model = models.LsiModel(corpus=corpus, num_topics=20, id2word=dictn)
    lsi_res = []
    for i in range(len(corpus)):
        lsi_res += [[val[1] for val in lsi_model[corpus[i]]]]
    return lsi_res

In [5]:
tokenize_data = [delete_stopword_and_lemmatize(nltk.word_tokenize(newsgroups_train.data[i]))
                 for i in range(len(newsgroups_train.data))]

In [6]:
dictn = corpora.Dictionary(tokenize_data)
dictn.filter_extremes(keep_n=1000)

bow = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(bow)
tfidf = [model[doc] for doc in bow]
lsi = get_lsi(tfidf, dictn)

In [115]:
get_res(lsi)

K-means
   ├─── HCV-measure
             ├─── H-score:              0.59
             ├─── C-score:              0.662
             └─── V-score:              0.624

   ├─── Silhouette-score:               0.141

   └─── Fowlkes-Mallows-score:          0.635
---------------------------------------------
DBSCAN
   ├─── HCV-measure
             ├─── H-score:              0.001
             ├─── C-score:              0.054
             └─── V-score:              0.002

   ├─── Silhouette-score:               0.304

   └─── Fowlkes-Mallows-score:          0.5
---------------------------------------------
AgglomerativeClustering
           ├─── HCV-measure
                     ├─── H-score:      0.515
                     ├─── C-score:      0.624
                     └─── V-score:      0.565

           ├─── Silhouette-score:       0.144

           └─── Fowlkes-Mallows-score:  0.6
