In [1]:
import os
import argparse

import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt

from stopwords import get_stopwords

from collections import defaultdict, Counter

from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

In [2]:
# `IPython.display` is the public home of `display` and `HTML`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook's main container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [3]:
def tokenize_extend(list_of_texts):
    """Flatten a collection of texts into one list of whitespace-separated tokens.

    Each element is converted with ``str()`` before splitting, so non-string
    entries (numbers, ``None``, NaN, ...) contribute their string form.

    Args:
        list_of_texts: iterable of texts (or arbitrary objects).

    Returns:
        A single flat list with the tokens of every element, in input order.
    """
    return [token for entry in list_of_texts for token in str(entry).split()]


def cluster_printer(labels, texts, path_to_save, n_words=10):
    """Describe every cluster by its most frequent non-stopword tokens.

    Texts are grouped by label, tokenised with ``tokenize_extend`` and counted.
    For each cluster up to ``n_words`` tokens are reported as ``word_count``
    entries separated by ``' | '``. Only the 100 most frequent tokens of a
    cluster are scanned, so fewer than ``n_words`` entries may be produced.

    Side effect: writes ``clusterization_<n_clusters>_<FLAG>.xlsx`` into
    ``path_to_save`` with the columns ``labels``, ``sizes`` and ``words``,
    sorted by cluster size (largest first). ``FLAG`` is read from the
    module-level namespace.

    Args:
        labels: cluster label of each text (same order as ``texts``).
        texts: the documents that were clustered.
        path_to_save: directory that receives the Excel summary.
        n_words: maximum number of tokens reported per cluster.

    Returns:
        dict mapping ``'label_<label>_size_<size>'`` to the token summary
        string, ordered by cluster size, largest first.
    """
    # Build a fresh set: membership tests are O(1) and the list returned by
    # get_stopwords is not modified in place.
    stop_words = set(get_stopwords('russian')) | {'могу'}

    # Seed with every label so each one appears in the report, in order of
    # first occurrence.
    texts_by_label = {label: [] for label in labels}
    for text, label in zip(texts, labels):
        texts_by_label[label].append(text)

    cluster_labels, sizes, word_cloud = [], [], []
    for label, cluster_texts in texts_by_label.items():
        top_words = []
        token_counts = Counter(tokenize_extend(cluster_texts))
        for word, count in token_counts.most_common(100):
            if len(top_words) >= n_words:
                break
            if word not in stop_words:
                top_words.append(word + '_' + str(count) + ' | ')
        cluster_labels.append(label)
        sizes.append(len(cluster_texts))
        word_cloud.append(''.join(top_words))

    # Column-wise construction keeps the three columns even when there are
    # no clusters at all.
    summary_df = pd.DataFrame({
        'labels': cluster_labels,
        'sizes': sizes,
        'words': word_cloud,
    })
    summary_df = summary_df.sort_values(by='sizes', ascending=False, ignore_index=True)
    report_name = f'clusterization_{len(texts_by_label)}_{FLAG}.xlsx'
    summary_df.to_excel(os.path.join(path_to_save, report_name), index=False)

    # Stable sort on the known sizes; ties keep first-occurrence order.
    ranked = sorted(
        zip(cluster_labels, sizes, word_cloud),
        key=lambda entry: entry[1],
        reverse=True,
    )
    return {
        f'label_{label}_size_{size}': words
        for label, size, words in ranked
    }


def clusterize_viz(spec, X_lite, texts_path, n_docs_from, n_docs_to, n_words, path_to_save):
    """Cluster embeddings, visualise them with t-SNE and export the results.

    Steps:
      1. Read rows ``n_docs_from:n_docs_to`` of the CSV at ``texts_path``
         (its ``messages_clean`` column holds the texts).
      2. Project ``X_lite`` to 2-D with t-SNE and draw a scatter plot.
      3. Run ``spec.fit_predict(X_lite)`` and save the texts together with
         their labels to ``<csv stem>_annotated_<FLAG>.xlsx`` in
         ``path_to_save`` (``FLAG`` is read from the module namespace).
      4. Print the silhouette score and the number of clusters.
      5. Show a plotly bubble chart of the per-cluster mean t-SNE position,
         sized by the cluster's member count.
      6. Delegate the per-cluster word summary to ``cluster_printer``.

    Args:
        spec: clustering estimator exposing ``fit_predict``.
        X_lite: embedding matrix, one row per document of the selected slice.
        texts_path: path to the CSV with a ``messages_clean`` column.
        n_docs_from: first row of the slice taken from the CSV.
        n_docs_to: end (exclusive) of the slice taken from the CSV.
        n_words: number of top words per cluster passed to ``cluster_printer``.
        path_to_save: directory that receives the Excel outputs.

    Returns:
        The dict produced by ``cluster_printer``.

    Raises:
        ValueError: if the number of selected texts differs from the number
            of rows in ``X_lite``.
    """
    # .copy() so that adding the label column below writes to an independent
    # frame rather than to a slice of the freshly read one.
    text_df = pd.read_csv(texts_path).iloc[n_docs_from:n_docs_to].copy()
    texts = text_df.messages_clean.values.tolist()

    # Fail before the expensive steps instead of at the column assignment.
    if len(text_df) != len(X_lite):
        raise ValueError(
            f'{len(text_df)} texts selected but {len(X_lite)} embeddings given'
        )

    # Embeddings are used unscaled for both t-SNE and clustering. The 2-D
    # projection is computed once and reused for the bubble chart.
    tsne_data = TSNE(random_state=17).fit_transform(X_lite)
    plt.figure(figsize=(6, 6), dpi=80)
    plt.title("t-SNE")
    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], alpha=0.5, s=15)

    labels = spec.fit_predict(X_lite)
    text_df['labels'] = labels

    # basename/splitext keeps the stem correct for relative paths ('./x.csv')
    # and for directories whose names contain dots.
    stem = os.path.splitext(os.path.basename(texts_path))[0]
    texts_path_modified = os.path.join(path_to_save, stem + f'_annotated_{FLAG}.xlsx')
    text_df.to_excel(texts_path_modified, index=False)

    n_clusters = np.unique(labels).shape[0]
    # The silhouette score is only defined for 2 .. n_samples - 1 clusters.
    if 1 < n_clusters < len(labels):
        print("silhouette_score = ", silhouette_score(X_lite, labels, random_state=666))
    else:
        print("silhouette_score is undefined for", n_clusters, "cluster(s)")
    print("number clusters = ", n_clusters)

    # Mean 2-D position and member count per cluster. Grouping by the actual
    # label values keeps counts aligned even when labels are not 0..k-1.
    points = pd.DataFrame(tsne_data, columns=['x', 'y'])
    points['label'] = labels
    by_label = points.groupby('label')
    cluster_mean = by_label[['x', 'y']].mean()
    cluster_mean['count'] = by_label.size()
    cluster_mean = cluster_mean.reset_index()

    fig = px.scatter(data_frame=cluster_mean,
                     x='x',
                     y='y',
                     size="count",
                     hover_name="label")

    cluster_meta = cluster_printer(labels, texts, path_to_save, n_words)
    fig.show()
    return cluster_meta


def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a linkage matrix from the model's ``children_`` and ``distances_``
    attributes plus the number of samples under each merge node, then hands
    it to ``dendrogram``.

    Args:
        model: fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # subtree_sizes[i] = number of original samples below merge node i.
    # Indices below leaf_total are leaves (size 1); larger indices refer to
    # an earlier merge whose size is already known.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [4]:
# Run configuration: which slice of documents to cluster and how to report it.
n_docs_from = 0       # first row of the slice (inclusive)
n_docs_to = 10_000    # end of the slice (exclusive)
n_words = 10          # top words reported per cluster
# NOTE(review): not referenced by any visible cell -- the clustering cell
# passes its own literal threshold; confirm which value is intended.
distance_threshold = 0.4


embeddings_path = '/hdd/docker_shared_folder/chatbot-data/embeddings/rubert/ru_emb_100000_historical-chatbot-diarized.csv'
texts_path      = '/hdd/docker_shared_folder/chatbot-data/data/historical-chatbot-diarized.csv'
path_to_save    = '/hdd/docker_shared_folder/chatbot-data/clusterization'
# Suffix used in the names of the exported files.
FLAG            = f'almat_cb_{n_docs_from}_{n_docs_to}'


# The embeddings file is parsed as a flat, newline-separated stream of floats.
X = np.fromfile(embeddings_path, dtype=float, sep="\n")

dim = 768  # number of values per embedding row
n_rows, n_leftover = divmod(X.shape[0], dim)
if n_leftover:
    # Trailing values that do not fill a whole row are dropped, but visibly.
    print(f'warning: dropping {n_leftover} trailing values that do not fill a {dim}-wide row')
# Slice + reshape instead of an in-place resize: the loaded buffer is not
# mutated and the row layout is explicit.
X = X[:n_rows * dim].reshape(n_rows, dim)
X = X[n_docs_from:n_docs_to]

In [None]:
# Complete-linkage agglomerative clustering on cosine distance. The number of
# clusters is not fixed (n_clusters=None); merging stops at distance_threshold.
agl_params = dict(
    distance_threshold=0.3,
    n_clusters=None,
    linkage="complete",
)
try:
    # scikit-learn >= 1.2 names the distance parameter `metric`.
    agl = AgglomerativeClustering(metric="cosine", **agl_params)
except TypeError:
    # Older scikit-learn releases only know the former name `affinity`.
    agl = AgglomerativeClustering(affinity="cosine", **agl_params)


cluster_meta = clusterize_viz(spec=agl,
                              X_lite=X,
                              texts_path=texts_path,
                              n_docs_from=n_docs_from,
                              n_docs_to=n_docs_to,
                              n_words=n_words,
                              path_to_save=path_to_save
                              )

silhouette_score =  0.03300577778238554


In [None]:
# Print every cluster key followed by its top-word summary, in dict order.
for cluster_key in cluster_meta:
    print(cluster_key, cluster_meta[cluster_key])