In [49]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import preprocessing

import torch
from transformers import BertTokenizer, BertModel

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

import numpy as np

from tqdm import tqdm
from random import randint as rand

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  # 'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [50]:
def get_acc(target, result, dataset):
    res = list(map(lambda x: round(x, 3), metrics.homogeneity_completeness_v_measure(target, result)))
    try:
        res += [round(metrics.silhouette_score(dataset, result, metric='euclidean'), 3)]
        res += [round(metrics.silhouette_score(dataset, result, metric='cosine'), 3)]
    except ValueError:
        res += [None, None]
    res += [round(metrics.adjusted_rand_score(target, result), 3)]
    return res

In [51]:
def print_acc(name_method, acc, ettab, tol=None):
    tab = ettab
    names = ["H-score", "C-score", "V-score", "Silhouette-score(euc)", "Silhouette-score(cos)", "Adjusted-rand-score"]

    if tol is None:
        tol = ["" for i in range(len(acc))]
    else:
        tol = [f" ± {tol[i]}" for i in range(len(acc))]

    print(f"{tab}{name_method}")
    tab += ettab

    for i in range(len(acc)):
        offset = 50 - len(tab) - len(names[i]) - 1
        print(f"{tab}{names[i]}:{' ' * offset}{acc[i]}{tol[i]}")

In [52]:
class Random_clust():
    def __init__(self):
        self.labels_ = []

    def fit(self, dataset, n_cls=4):
        self.labels_ = [rand(0, n_cls - 1) for i in range(len(dataset))]
        return self

In [53]:
def get_res(typename, dataset):
    print(typename)

    methods = {"Random": Random_clust(),
               "K-means(euc)": KMeans(n_clusters=4, n_init="auto"),
               "K-means(cos)": KMeans(n_clusters=4, n_init="auto"),
               "DBSCAN(euc)": DBSCAN(eps=0.3, min_samples=5, metric="euclidean"),
               "DBSCAN(cos)": DBSCAN(eps=0.3, min_samples=5, metric="cosine"),
               "AgglomerativeClustering(euc, ward)": AgglomerativeClustering(n_clusters=4, metric="euclidean"),
               "AgglomerativeClustering(cos, complete)": AgglomerativeClustering(n_clusters=4, metric="cosine", linkage="complete")}

    tab = " " * 7
    kmeans_it = 10
    norm_dataset = preprocessing.normalize(dataset)

    for key in methods.keys():

        method = methods[key]

        if key == "K-means(euc)" or key == "K-means(cos)":
            accs = []
            for i in range(kmeans_it):
                if key == "K-means(cos)":
                    accs += [get_acc(newsgroups_train.target, method.fit(norm_dataset).labels_, norm_dataset)]
                else:
                    accs += [get_acc(newsgroups_train.target, method.fit(dataset).labels_, dataset)]
            acc, tol = [], []
            for i in range(len(accs[0])):
                acc += [round(np.mean([accs[j][i] for j in range(kmeans_it)]), 3)]
                tol += [round(np.std([accs[j][i] for j in range(kmeans_it)]), 3)]
            print_acc(key, acc, tab, tol)
        else:
            acc = get_acc(newsgroups_train.target, method.fit(dataset).labels_, dataset)
            print_acc(key, acc, tab)

        if key != list(methods.keys())[-1]:
            print()

    print("-" * 100)

In [54]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = lemmatizer.lemmatize(word.lower())
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None\
            and not any(sym in word for sym in special_sym):
            res += [word]
    return res

In [55]:
NOUN_TAG = ['NN', "NNS"]
JJ_TAG = ["JJ", "JJR", "JJS"]
VB_TAG = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]


def only_pos(list_doc, pos_t_cls):
    len_docs = len(list_doc)
    cnt_cls = len(pos_t_cls)
    res = [[] for i in range(cnt_cls)]

    with tqdm(total=len_docs, position=0, leave=True) as pbar:
        for idxd in range(len_docs):

            pbar.set_description(f"Doc: {idxd+1}/{len_docs}")
            pbar.update()

            ndoc = [[] for i in range(cnt_cls)]
            for elem in nltk.pos_tag(list_doc[idxd]):
                for idxpos in range(cnt_cls):
                    if elem[1] in pos_t_cls[idxpos]:
                        ndoc[idxpos] += [elem[0]]

            for idxpos in range(cnt_cls):
                res[idxpos] += [ndoc[idxpos]]
    return tuple(res)

In [56]:
def get_bow_var(bow, dictn):
    res = []
    for i in range(len(bow)):
        doc = [0 for j in range(len(dictn))]
        for val in bow[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res


def get_tfidf_var(tfidf, dictn):
    res = []
    for i in range(len(tfidf)):
        doc = [0 for j in range(len(dictn))]
        for val in tfidf[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res

In [57]:
def get_lsi(corpus, dictn, num_topics=20):
    lsi_model = models.LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictn)
    lsi_res = []
    for i in range(len(corpus)):
        lsi_res += [[val[1] for val in lsi_model[corpus[i]]]]
    return lsi_res


def get_lda(corpus, dictn, alpha='symmetric', bbeta=None, num_topics=20):
    lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictn, passes=10, alpha=alpha, eta=bbeta)
    lda_res = []
    for i in range(len(corpus)):
        lda_res += [[val[1] for val in lda_model.get_document_topics(corpus[i], minimum_probability=0.0)]]
    return lda_res

In [58]:
tokenize_data = [delete_stopword_and_lemmatize(nltk.word_tokenize(newsgroups_train.data[i]))
                 for i in range(len(newsgroups_train.data))]

In [59]:
dictn = corpora.Dictionary(tokenize_data)
dictn.filter_extremes(keep_n=1000)

bow = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(bow)
tfidf = [model[doc] for doc in bow]

In [60]:
bow_res = {"BoW": get_bow_var(bow, dictn)}
tf_idf_res = {"TF-IDF": get_tfidf_var(tfidf, dictn)}
Lsi_res = {"LSI(ntop=5)": get_lsi(tfidf, dictn, 5),
           "LSI(ntop=20)": get_lsi(tfidf, dictn, 20),
           "LSI(ntop=50)": get_lsi(tfidf, dictn, 50)}

In [61]:
Lda_res = {"LDA(ntop=5, alpha=symmetric, beta=None)": get_lda(bow, dictn, 'symmetric', None, num_topics=5),
           "LDA(ntop=20, alpha=symmetric, beta=None)": get_lda(bow, dictn, 'symmetric', None, num_topics=20),
           "LDA(ntop=50, alpha=symmetric, beta=None)": get_lda(bow, dictn, 'symmetric', None, num_topics=50),
           "LDA(ntop=5, alpha=0.1, beta=0.1)": get_lda(bow, dictn, 0.1, 0.1, num_topics=5),
           "LDA(ntop=5, alpha=0.5, beta=0.1)": get_lda(bow, dictn, 0.5, 0.1, num_topics=5),
           "LDA(ntop=5, alpha=0.5, beta=0.5)": get_lda(bow, dictn, 0.5, 0.5, num_topics=5),
           "LDA(ntop=5, alpha=2, beta=0.1)": get_lda(bow, dictn, 2, 0.1, num_topics=5),
           "LDA(ntop=5, alpha=2, beta=0.5)": get_lda(bow, dictn, 2, 0.5, num_topics=5),
           "LDA(ntop=5, alpha=0.1, beta=2)": get_lda(bow, dictn, 0.1, 2, num_topics=5)}

In [62]:
tokenize_data_noun, tokenize_data_noun_jj_vb = only_pos(tokenize_data, [NOUN_TAG, NOUN_TAG + JJ_TAG + VB_TAG])

Doc: 3765/3765: 100%|██████████████████████████████████████████████████████████████| 3765/3765 [01:09<00:00, 53.80it/s]


In [63]:
dictn_noun, dictn_noun_jj_vb = corpora.Dictionary(tokenize_data_noun), corpora.Dictionary(tokenize_data_noun_jj_vb)
dictn_noun.filter_extremes(keep_n=1000)
dictn_noun_jj_vb.filter_extremes(keep_n=1000)

bow_noun = [dictn_noun.doc2bow(doc) for doc in tokenize_data_noun]
bow_noun_jj_vb = [dictn_noun_jj_vb.doc2bow(doc) for doc in tokenize_data_noun_jj_vb]

model = TfidfModel(bow_noun)
tfidf_noun = [model[doc] for doc in bow_noun]

model = TfidfModel(bow_noun_jj_vb)
tfidf_noun_jj_vb = [model[doc] for doc in bow_noun_jj_vb]

In [64]:
bow_res.update({"BoW(NOUN)": get_bow_var(bow_noun, dictn_noun),
                "BoW(NOUN_JJ_VB)": get_bow_var(bow_noun_jj_vb, dictn_noun_jj_vb)})
tf_idf_res.update({"TF-IDF(NOUN)": get_tfidf_var(tfidf_noun, dictn_noun),
                   "TF-IDF(NOUN_JJ_VB)": get_tfidf_var(tfidf_noun_jj_vb, dictn_noun_jj_vb)})
Lsi_res.update({"LSI(NOUN, ntop=5)": get_lsi(tfidf_noun, dictn_noun, 5),
                "LSI(NOUN_JJ_VB, ntop=5)": get_lsi(tfidf_noun_jj_vb, dictn_noun_jj_vb, 5)})
Lda_res.update({"LDA(NOUN, ntop=5, alpha=symmetric, beta=None)": get_lda(bow_noun, dictn_noun, num_topics=5),
                "LDA(NOUN_JJ_VB, ntop=5, alpha=symmetric, beta=None)": get_lda(bow_noun_jj_vb, dictn_noun_jj_vb, num_topics=5)})

In [65]:
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertModel.from_pretrained("bert-base-uncased")
preproc_data_bert = [tokenizer_bert(" ".join(tokenize_data[i]), padding=True, truncation=True, return_tensors="pt")
                     for i in range(len(tokenize_data))]

In [66]:
with torch.no_grad():
    vec_data_bert = []
    len_docs = len(tokenize_data)

    with tqdm(total=len_docs, position=0, leave=True) as pbar:
        for idxdoc in range(len_docs):

            pbar.set_description(f"Doc: {idxdoc+1}/{len_docs}")
            pbar.update()

            vec_data_bert += [model_bert(**preproc_data_bert[idxdoc]).last_hidden_state[:, 0, :].tolist()[0]]

bert_res = {"Bert-model": vec_data_bert}

Doc: 3765/3765: 100%|██████████████████████████████████████████████████████████████| 3765/3765 [26:00<00:00,  2.41it/s]


In [67]:
reses = [bow_res, tf_idf_res, Lsi_res, Lda_res, bert_res]

In [68]:
for res in reses:
    for key in res.keys():
        get_res(key, res[key])

BoW
       Random
              H-score:                            0.001
              C-score:                            0.001
              V-score:                            0.001
              Silhouette-score(euc):              -0.014
              Silhouette-score(cos):              -0.002
              Adjusted-rand-score:                -0.0

       K-means(euc)
              H-score:                            0.007 ± 0.004
              C-score:                            0.095 ± 0.033
              V-score:                            0.012 ± 0.007
              Silhouette-score(euc):              0.698 ± 0.132
              Silhouette-score(cos):              -0.069 ± 0.012
              Adjusted-rand-score:                0.003 ± 0.002

       K-means(cos)
              H-score:                            0.385 ± 0.078
              C-score:                            0.423 ± 0.067
              V-score:                            0.403 ± 0.073
              Silhouette-s