In [3]:
import spacy
from spacy.lang.ru import stop_words

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import DistanceMetric
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import re

In [4]:
nlp = spacy.load("ru_core_news_sm")
vectorizer = TfidfVectorizer()

In [5]:
patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"

comments = []
with open("./text.txt", mode="r", encoding="utf8") as f:
    for line in f.readlines():
        if (line != '\n' and not line.startswith('[')):
            string = re.sub(patterns, ' ', line.strip())
            comments.append(string.lower())

comments

['будет солнечно',
 'солнечно до конца дня  в красносельском районе как всегда ветер',
 'сегодня будет солнечно  ясное небо  ни облачка не появится  ночь чистая  можно будет даже звёзды увидеть',
 'температура опустится сразу на   градусов  так что на улице днем будет не выше плюс   ощущение холода будут усиливать дожди  которые начнутся в последний день этой недели и затем будут идти несколько суток',
 'сегодня ясно  без осадков  атмосферное давление выше нормы ',
 'сегодня довольно теплая для сентября погода  небо ясное и осадков не ожидается  так что прогноз на сегодня очень приятный ',
 'в санкт петербурге сегодня ожидается   °  без осадков  легкий ветер ',
 'трудно с ходу назвать фильм  который породил столько же абсурдных пародий  данное творение несмотря на свою «трешовость»  снискал огромное количество поклонников',
 'трагикомедия  которая показывает последние дни двух молодых людей со смертельными диагнозами',
 'фильм явно тянет на премию оскар  проработанный сюжет  очень крас

In [7]:
def tokenize(text):
    stops = stop_words.STOP_WORDS
    doc = nlp(text)
    lemmatized_tokens = [token for token in doc if not token.is_punct and not token.is_space and token not in stops]

    return lemmatized_tokens

def filter_tokens(tokens, parts_of_speech):
    filtered = [token for token in tokens if token.pos_ in parts_of_speech]

    return filtered

def make_texts(all_tokens):
    texts = []
    for tokens in all_tokens:
        texts.append(' '.join([token.lemma_ for token in tokens]))
    return texts

def visualize_distance_matrix(matrix, texts, name_of_distance, name_of_tokens):
    fig, ax = plt.subplots()
    cax = ax.matshow(matrix, cmap='viridis')
    fig.colorbar(cax)

    ax.set_xticks(np.arange(len(texts)))
    ax.set_yticks(np.arange(len(texts)))

    for i in range(len(texts)):
        for j in range(len(texts)):
            value = f"{matrix[i, j]:.2f}"
            ax.text(j, i, value, ha='center', va='center')

    plt.xlabel('Text Index (Column)')
    plt.ylabel('Text Index (Row)')
    plt.title("metric: " + name_of_distance + ", " + name_of_tokens)
    plt.show()

In [10]:
def cluster_kmeans(matrix, metrics, true_labels, num_clusters=2, num_iterations=50):
    scores = {}
    for metric in metrics:
        scores.update({metric.__name__: []})

    for i in range(num_iterations):
        clusters = KMeans(n_clusters=num_clusters, random_state=i, n_init=10)

        clusters.fit(matrix)
        
        for metric in metrics:
            score = metric(true_labels, clusters.labels_)
            scores[metric.__name__].append(score)

    kmeans_res = ""
    for metric in scores:
        kmeans_res += f"\n{metric} \nMax: {np.max(scores[metric])} \
                                    \nMin: {np.min(scores[metric])} \
                                    \nAVG: {np.mean(scores[metric])} \n"
        
    return kmeans_res

In [11]:
def cluster_hierarchy(matrix, metrics, true_labels, num_clusters=2):
    linkages = ["complete", "average", "single"]

    hierarchy_res = ""

    if matrix.shape[0] != matrix.shape[1]:
        affinity = "euclidean"
        matrix = matrix.toarray()
        linkages.append("ward")
    else:
        affinity = "precomputed"

    for linkage in linkages:
        hierarchy_res += f"\n{linkage}"

        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)

        agg_clustering.fit(matrix)

        for metric in metrics:
            score = metric(true_labels, agg_clustering.labels_)
            hierarchy_res += f"\n{metric.__name__}: {score}"
        
        hierarchy_res += "\n"
    
    return hierarchy_res

In [22]:
def main(list_of_text, vectorize, is_visualize_distance_matrix=False):
    true_labels = [0]*7+[1]*5
    distances = ["euclidean", "jaccard", "cosine"]
    metrics = [normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score]

    kmeans_data = pd.DataFrame(columns=distances)
    hierarchy_data = pd.DataFrame(columns=distances)

    all_tokens = [tokenize(comment) for comment in list_of_text]

    noun_tokens = [filter_tokens(tokens, ["NOUN"]) for tokens in all_tokens]
    adj_tokens = [filter_tokens(tokens, ["ADJ"]) for tokens in all_tokens]
    noun_adj_tokens = [filter_tokens(tokens, ["NOUN", "ADJ"]) for tokens in all_tokens]

    list_of_tokens = {"ALL": all_tokens, "NOUNS": noun_tokens, "ADJ": adj_tokens, "NOUNS and ADJ": noun_adj_tokens}

    for name, tokens in list_of_tokens.items():
        if vectorize == "TF-IDF":
            texts = make_texts(tokens)
            text_vectors = vectorizer.fit_transform(texts)
        else:
            text_vectors = [[token.vector for token in tokens] for tokens in all_tokens]
        
        for distance in distances:
            if distance == "euclidean":
                distance_matrix = euclidean_distances(text_vectors)
            elif distance == "jaccard":
                dist = DistanceMetric.get_metric(distance)
                distance_matrix = dist.pairwise(text_vectors)
            elif distance == "cosine":
                distance_matrix = cosine_distances(text_vectors)
            elif distance == "none":
                distance_matrix = text_vectors

            if is_visualize_distance_matrix:
                visualize_distance_matrix(distance_matrix, list_of_text, distance, name)
            
            kmeans_data.loc[name, distance] = cluster_kmeans(distance_matrix, metrics, true_labels)

            hierarchy_data.loc[name, distance] = cluster_hierarchy(distance_matrix, metrics, true_labels)

    return kmeans_data, hierarchy_data


In [23]:
kmeans_data, hierarchy_data = main(comments, "TF-IDF")

In [24]:
from IPython.display import display

display(hierarchy_data.style.set_properties(**{
    'text-align': 'center',
    'white-space': 'pre-wrap',
}))

Unnamed: 0,euclidean,jaccard,cosine
ALL,complete normalized_mutual_info_score: 0.45413961418683363 adjusted_rand_score: 0.40054495912806537 v_measure_score: 0.4541396141868337 homogeneity_score: 0.41507127766752966 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771
NOUNS,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771
ADJ,complete normalized_mutual_info_score: 0.06875251074718794 adjusted_rand_score: 0.04087193460490463 v_measure_score: 0.06875251074718793 homogeneity_score: 0.06283792822122235 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771
NOUNS and ADJ,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771,complete normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 average normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771 single normalized_mutual_info_score: 0.16217929907568504 adjusted_rand_score: 0.06796116504854369 v_measure_score: 0.16217929907568504 homogeneity_score: 0.11533532084453771


In [19]:
kmeans_data.to_excel("kmeans_res1.xlsx")

In [26]:
hierarchy_data.to_excel("hierarchy_res1.xlsx")