In [38]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

from tqdm import tqdm
from random import randint as rand

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  # 'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))


# добавить Bert, в K-means добавить случай матрицы расстояний через cos
# и считать оценку на основе многих испытаний,
# в AgglomerativeClustering посмотреть различный варианты счета расстояний

In [46]:
def get_res(typename, dataset):
    print(typename)

    results = [[rand(0, 3) for i in range(len(dataset))],
               KMeans(n_clusters=4, random_state=42, n_init="auto").fit(dataset),
               DBSCAN(eps=0.3, min_samples=5, metric="euclidean").fit(dataset),
               DBSCAN(eps=0.3, min_samples=5, metric="cosine").fit(dataset),
               AgglomerativeClustering(n_clusters=4).fit(dataset)]

    names = ["Random", "K-means", "DBSCAN(euc)", "DBSCAN(cos)", "AgglomerativeClustering"]
    tab = " " * 7

    for i in range(len(results)):
        if i == 0:
            res = results[i]
        else:
            res = results[i].labels_
        name = names[i]

        acc = metrics.homogeneity_completeness_v_measure(newsgroups_train.target, res)
        print_acc(name, "HCV-measure", acc, tab, next_res=False)

        try:
            acc = metrics.silhouette_score(dataset, res, metric='euclidean')
            print_acc(name, "Silhouette-score(euc)", acc, tab, next_res=True)

            acc = metrics.silhouette_score(dataset, res, metric='cosine')
            print_acc(name, "Silhouette-score(cos)", acc, tab, next_res=True)
        except ValueError:
            print_acc(name, "Silhouette-score", "error 1 cluster", tab, next_res=True)

        acc = metrics.fowlkes_mallows_score(newsgroups_train.target, res)
        print_acc(name, "Fowlkes-Mallows-score", acc, tab, next_res=True)

        if i < len(results) - 1:
            print()

    print("-" * 100)

In [22]:
def print_acc(name_method, name_metric, acc, ettab, next_res=False):
    tab = ettab

    if not next_res:
        print(f"{tab}{name_method}")
    else:
        print()

    tab += ettab

    if name_metric == "HCV-measure":
        print(f"{tab}HCV-measure")
        tab += ettab

        print(f"{tab}H-score:{tab}{round(acc[0], 3)}")
        print(f"{tab}C-score:{tab}{round(acc[1], 3)}")
        print(f"{tab}V-score:{tab}{round(acc[2], 3)}")
    else:
        try:
            print(f"{tab}{name_metric}:{tab}{round(acc, 3)}")
        except TypeError:
            print(f"{tab}{name_metric}:{tab}{acc}")

In [3]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = lemmatizer.lemmatize(word.lower())
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None\
            and not any(sym in word for sym in special_sym):
            res += [word]
    return res

In [32]:
NOUN_TAG = ['NN', "NNS"]
JJ_TAG = ["JJ", "JJR", "JJS"]
VB_TAG = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]


def only_pos(list_doc, pos_t_cls):
    len_docs = len(list_doc)
    cnt_cls = len(pos_t_cls)
    res = [[] for i in range(cnt_cls)]

    with tqdm(total=len_docs, position=0, leave=True) as pbar:
        for idxd in range(len_docs):

            pbar.set_description(f"Doc: {idxd+1}/{len_docs}")
            pbar.update()

            ndoc = [[] for i in range(cnt_cls)]
            for elem in nltk.pos_tag(list_doc[idxd]):
                for idxpos in range(cnt_cls):
                    if elem[1] in pos_t_cls[idxpos]:
                        ndoc[idxpos] += [elem[0]]

            for idxpos in range(cnt_cls):
                res[idxpos] += [ndoc[idxpos]]
    return tuple(res)

In [4]:
def get_bow_var(bow, dictn):
    res = []
    for i in range(len(bow)):
        doc = [0 for j in range(len(dictn))]
        for val in bow[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res


def get_tfidf_var(tfidf, dictn):
    res = []
    for i in range(len(tfidf)):
        doc = [0 for j in range(len(dictn))]
        for val in tfidf[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res

In [6]:
def get_lsi(corpus, dictn, num_topics=20):
    lsi_model = models.LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictn)
    lsi_res = []
    for i in range(len(corpus)):
        lsi_res += [[val[1] for val in lsi_model[corpus[i]]]]
    return lsi_res


def get_lda(corpus, dictn, alpha='symmetric', bbeta=None, num_topics=20):
    lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictn, passes=10, alpha=alpha, eta=bbeta)
    lda_res = []
    for i in range(len(corpus)):
        lda_res += [[val[1] for val in lda_model.get_document_topics(corpus[i], minimum_probability=0.0)]]
    return lda_res

In [7]:
tokenize_data = [delete_stopword_and_lemmatize(nltk.word_tokenize(newsgroups_train.data[i]))
                 for i in range(len(newsgroups_train.data))]

In [8]:
dictn = corpora.Dictionary(tokenize_data)
dictn.filter_extremes(keep_n=1000)

bow = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(bow)
tfidf = [model[doc] for doc in bow]

In [36]:
bow_res = {"BoW": get_bow_var(bow, dictn)}
tf_idf_res = {"TF-IDF": get_tfidf_var(tfidf, dictn)}
Lsi_res = {"LSI(ntop=5)": get_lsi(tfidf, dictn, 5),
           "LSI(ntop=20)": get_lsi(tfidf, dictn, 20),
           "LSI(ntop=50)": get_lsi(tfidf, dictn, 50)}

In [37]:
Lda_res = {"LDA(ntop=5, alpha=symmetric, beta=None)": get_lda(bow, dictn, 'symmetric', None, num_topics=5),
           "LDA(ntop=20, alpha=symmetric, beta=None)": get_lda(bow, dictn, 'symmetric', None, num_topics=20),
           "LDA(ntop=50, alpha=symmetric, beta=None)": get_lda(bow, dictn, 'symmetric', None, num_topics=50),
           "LDA(ntop=20, alpha=0.1, beta=0.1)": get_lda(bow, dictn, 0.1, 0.1, num_topics=20),
           "LDA(ntop=20, alpha=0.5, beta=0.1)": get_lda(bow, dictn, 0.5, 0.1, num_topics=20),
           "LDA(ntop=20, alpha=0.5, beta=0.5)": get_lda(bow, dictn, 0.5, 0.5, num_topics=20),
           "LDA(ntop=20, alpha=2, beta=0.1)": get_lda(bow, dictn, 2, 0.1, num_topics=20),
           "LDA(ntop=20, alpha=2, beta=0.5)": get_lda(bow, dictn, 2, 0.5, num_topics=20),
           "LDA(ntop=20, alpha=0.1, beta=2)": get_lda(bow, dictn, 0.1, 2, num_topics=20)}

In [33]:
tokenize_data_noun, tokenize_data_noun_jj_vb = only_pos(tokenize_data, [NOUN_TAG, NOUN_TAG + JJ_TAG + VB_TAG])

Doc: 3765/3765: 100%|██████████████████████████████████████████████████████████████| 3765/3765 [01:14<00:00, 50.87it/s]


In [40]:
dictn_noun, dictn_noun_jj_vb = corpora.Dictionary(tokenize_data_noun), corpora.Dictionary(tokenize_data_noun_jj_vb)
dictn_noun.filter_extremes(keep_n=1000)
dictn_noun_jj_vb.filter_extremes(keep_n=1000)

bow_noun = [dictn_noun.doc2bow(doc) for doc in tokenize_data_noun]
bow_noun_jj_vb = [dictn_noun_jj_vb.doc2bow(doc) for doc in tokenize_data_noun_jj_vb]

model = TfidfModel(bow_noun)
tfidf_noun = [model[doc] for doc in bow_noun]

model = TfidfModel(bow_noun_jj_vb)
tfidf_noun_jj_vb = [model[doc] for doc in bow_noun_jj_vb]

In [41]:
bow_res.update({"BoW(NOUN)": get_bow_var(bow_noun, dictn_noun),
                "BoW(NOUN_JJ_VB)": get_bow_var(bow_noun_jj_vb, dictn_noun_jj_vb)})
tf_idf_res.update({"TF-IDF(NOUN)": get_tfidf_var(tfidf_noun, dictn_noun),
                   "TF-IDF(NOUN_JJ_VB)": get_tfidf_var(tfidf_noun_jj_vb, dictn_noun_jj_vb)})
Lsi_res.update({"LSI(NOUN, ntop=20)": get_lsi(tfidf_noun, dictn_noun, 20),
                "LSI(NOUN_JJ_VB, ntop=20)": get_lsi(tfidf_noun_jj_vb, dictn_noun_jj_vb, 20)})
Lda_res.update({"LDA(NOUN, ntop=20, alpha=symmetric, beta=None)": get_lda(bow_noun, dictn_noun),
                "LDA(NOUN_JJ_VB, ntop=20, alpha=symmetric, beta=None)": get_lda(bow_noun_jj_vb, dictn_noun_jj_vb)})

In [45]:
Lda_res.update({"LDA(NOUN, ntop=5, alpha=symmetric, beta=None)": get_lda(bow_noun, dictn_noun, num_topics=5),
                "LDA(NOUN_JJ_VB, ntop=5, alpha=symmetric, beta=None)": get_lda(bow_noun_jj_vb, dictn_noun_jj_vb, num_topics=5)})

In [47]:
reses = [bow_res, tf_idf_res, Lsi_res, Lda_res]

In [48]:
for res in reses:
    for key in res.keys():
        get_res(key, res[key])

BoW
       Random
              HCV-measure
                     H-score:                     0.0
                     C-score:                     0.0
                     V-score:                     0.0

              Silhouette-score(euc):              -0.006

              Silhouette-score(cos):              -0.002

              Fowlkes-Mallows-score:              0.251

       K-means
              HCV-measure
                     H-score:                     0.01
                     C-score:                     0.098
                     V-score:                     0.018

              Silhouette-score(euc):              0.653

              Silhouette-score(cos):              -0.098

              Fowlkes-Mallows-score:              0.49

       DBSCAN(euc)
              HCV-measure
                     H-score:                     0.0
                     C-score:                     1.0
                     V-score:                     0.0

              Silhouette-score: 