In [1]:
import re
import glob
import pandas as pd
import os
import pickle
import numpy as np
from datetime import datetime
import codecs

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from scipy.sparse.linalg import svds
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 4.0 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.2


In [4]:
import unidecode

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Wishart clustering function
# https://github.com/Radi4/BotDetection/blob/master/Wishart.py

import numpy as np
from scipy.special import gamma
from sklearn.neighbors import KDTree
from collections import defaultdict
from tqdm import tqdm

class Wishart:
    def __init__(self, wishart_neighbors, significance_level):
        self.wishart_neighbors = wishart_neighbors  # Number of neighbors
        self.significance_level = significance_level  # Significance level

    def fit(self, X):
        from sklearn.neighbors import KDTree
        kdt = KDTree(X, metric='euclidean')

        #add one because you are your neighb.
        distances, neighbors = kdt.query(X, k = self.wishart_neighbors + 1, return_distance = True)
        neighbors = neighbors[:, 1:]


        distances = distances[:, -1]
        indexes = np.argsort(distances)
        
        size, dim = X.shape

        self.object_labels = np.zeros(size, dtype = int) - 1

        #index in tuple
        #min_dist, max_dist, flag_to_significant
        self.clusters = np.array([(1., 1., 0)])
        self.clusters_to_objects = defaultdict(list)
        print('Start clustering')

        for index in indexes:
            neighbors_clusters =\
                np.concatenate([self.object_labels[neighbors[index]], self.object_labels[neighbors[index]]])
            unique_clusters = np.unique(neighbors_clusters).astype(int)
            unique_clusters = unique_clusters[unique_clusters != -1]


            if len(unique_clusters) == 0:
                self._create_new_cluster(index, distances[index])
            else:
                max_cluster = unique_clusters[-1]
                min_cluster = unique_clusters[0]
                if max_cluster == min_cluster:
                    if self.clusters[max_cluster][-1] < 0.5:
                        self._add_elem_to_exist_cluster(index, distances[index], max_cluster)
                    else:
                        self._add_elem_to_noise(index)
                else:
                    my_clusters = self.clusters[unique_clusters]
                    flags = my_clusters[:, -1]
                    if np.min(flags) > 0.5:
                        self._add_elem_to_noise(index)
                    else:
                        significan = np.power(my_clusters[:, 0], -dim) - np.power(my_clusters[:, 1], -dim)
                        significan *= self.wishart_neighbors
                        significan /= size
                        significan /= np.power(np.pi, dim / 2)
                        significan *= gamma(dim / 2 + 1)
                        significan_index = significan >= self.significance_level

                        significan_clusters = unique_clusters[significan_index]
                        not_significan_clusters = unique_clusters[~significan_index]
                        significan_clusters_count = len(significan_clusters)
                        if significan_clusters_count > 1 or min_cluster == 0:
                            self._add_elem_to_noise(index)
                            self.clusters[significan_clusters, -1] = 1
                            for not_sig_cluster in not_significan_clusters:
                                if not_sig_cluster == 0:
                                    continue

                                for bad_index in self.clusters_to_objects[not_sig_cluster]:
                                    self._add_elem_to_noise(bad_index)
                                self.clusters_to_objects[not_sig_cluster].clear()
                        else:
                            for cur_cluster in unique_clusters:
                                if cur_cluster == min_cluster:
                                    continue

                                for bad_index in self.clusters_to_objects[cur_cluster]:
                                    self._add_elem_to_exist_cluster(bad_index, distances[bad_index], min_cluster)
                                self.clusters_to_objects[cur_cluster].clear()

                            self._add_elem_to_exist_cluster(index, distances[index], min_cluster)

        return self.clean_data()

    def clean_data(self):
        unique = np.unique(self.object_labels)
        index = np.argsort(unique)
        if unique[0] != 0:
            index += 1
        true_cluster = {unq :  index for unq, index in zip(unique, index)}
        result = np.zeros(len(self.object_labels), dtype = int)
        for index, unq in enumerate(self.object_labels):
            result[index] = true_cluster[unq]
        return result

    def _add_elem_to_noise(self, index):
        self.object_labels[index] = 0
        self.clusters_to_objects[0].append(index)

    def _create_new_cluster(self, index, dist):
        self.object_labels[index] = len(self.clusters)
        self.clusters_to_objects[len(self.clusters)].append(index)
        self.clusters = np.append(self.clusters, [(dist, dist, 0)], axis = 0)

    def _add_elem_to_exist_cluster(self, index, dist, cluster_label):
        self.object_labels[index] = cluster_label
        self.clusters_to_objects[cluster_label].append(index)
        self.clusters[cluster_label][0] = min(self.clusters[cluster_label][0], dist)
        self.clusters[cluster_label][1] = max(self.clusters[cluster_label][1], dist)


class PreTrainWishart:
    def __init__(self, wishart_neighbors, significance_level, distances, neighbors):
        self.wishart_neighbors = wishart_neighbors  # Number of neighbors
        self.significance_level = significance_level  # Significance level
        self.distances = distances
        self.neighbors = neighbors

    def fit(self, X):
        from sklearn.neighbors import KDTree
        kdt = KDTree(X, metric='euclidean')

        #add one because you are your neighb.
        neighbors = self.neighbors[:, 1 : self.wishart_neighbors + 1]
        distances = self.distances[:, self.wishart_neighbors]
        indexes = np.argsort(distances)
        
        size, dim = X.shape

        self.object_labels = np.zeros(size, dtype = int) - 1

        #index in tuple
        #min_dist, max_dist, flag_to_significant
        self.clusters = np.array([(1., 1., 0)])
        self.clusters_to_objects = defaultdict(list)

        for index in indexes:
            neighbors_clusters =\
                np.concatenate([self.object_labels[neighbors[index]], self.object_labels[neighbors[index]]])
            unique_clusters = np.unique(neighbors_clusters).astype(int)
            unique_clusters = unique_clusters[unique_clusters != -1]


            if len(unique_clusters) == 0:
                self._create_new_cluster(index, distances[index])
            else:
                max_cluster = unique_clusters[-1]
                min_cluster = unique_clusters[0]
                if max_cluster == min_cluster:
                    if self.clusters[max_cluster][-1] < 0.5:
                        self._add_elem_to_exist_cluster(index, distances[index], max_cluster)
                    else:
                        self._add_elem_to_noise(index)
                else:
                    my_clusters = self.clusters[unique_clusters]
                    flags = my_clusters[:, -1]
                    if np.min(flags) > 0.5:
                        self._add_elem_to_noise(index)
                    else:
                        significan = np.power(my_clusters[:, 0], -dim) - np.power(my_clusters[:, 1], -dim)
                        significan *= self.wishart_neighbors
                        significan /= size
                        significan /= np.power(np.pi, dim / 2)
                        significan *= gamma(dim / 2 + 1)
                        significan_index = significan >= self.significance_level

                        significan_clusters = unique_clusters[significan_index]
                        not_significan_clusters = unique_clusters[~significan_index]
                        significan_clusters_count = len(significan_clusters)
                        if significan_clusters_count > 1 or min_cluster == 0:
                            self._add_elem_to_noise(index)
                            self.clusters[significan_clusters, -1] = 1
                            for not_sig_cluster in not_significan_clusters:
                                if not_sig_cluster == 0:
                                    continue

                                for bad_index in self.clusters_to_objects[not_sig_cluster]:
                                    self._add_elem_to_noise(bad_index)
                                self.clusters_to_objects[not_sig_cluster].clear()
                        else:
                            for cur_cluster in unique_clusters:
                                if cur_cluster == min_cluster:
                                    continue

                                for bad_index in self.clusters_to_objects[cur_cluster]:
                                    self._add_elem_to_exist_cluster(bad_index, distances[bad_index], min_cluster)
                                self.clusters_to_objects[cur_cluster].clear()

                            self._add_elem_to_exist_cluster(index, distances[index], min_cluster)

        return self.clean_data()

    def clean_data(self):
        unique = np.unique(self.object_labels)
        index = np.argsort(unique)
        if unique[0] != 0:
            index += 1
        true_cluster = {unq :  index for unq, index in zip(unique, index)}
        result = np.zeros(len(self.object_labels), dtype = int)
        for index, unq in enumerate(self.object_labels):
            result[index] = true_cluster[unq]
        return result

    def _add_elem_to_noise(self, index):
        self.object_labels[index] = 0
        self.clusters_to_objects[0].append(index)

    def _create_new_cluster(self, index, dist):
        self.object_labels[index] = len(self.clusters)
        self.clusters_to_objects[len(self.clusters)].append(index)
        self.clusters = np.append(self.clusters, [(dist, dist, 0)], axis = 0)

    def _add_elem_to_exist_cluster(self, index, dist, cluster_label):
        self.object_labels[index] = cluster_label
        self.clusters_to_objects[cluster_label].append(index)
        self.clusters[cluster_label][0] = min(self.clusters[cluster_label][0], dist)
        self.clusters[cluster_label][1] = max(self.clusters[cluster_label][1], dist)

## Create a vector representation based on TfidfVectorizer (on the bot texts)

In [7]:
# Stopwords
print(stopwords.words('spanish'))

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estás', 'está', 'estamos', 'estáis', 'están', 'e

In [8]:
def make_corpus(input_path, output_file_path):
    i = 0
    file_list = glob.glob(input_path + '*')
    
    with open(output_file_path, 'w+') as output_file:
        for file in file_list:
            if i % 500 == 0:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                print(dt_string, '| ',  'number of processed files: ' + str(i), '| ', 
                      'percentage of completion:', str(round(i/len(file_list), 2)* 100) + ' %' )
            i+=1
            with open(file, 'r') as input_file:
                output_file.write(input_file.read().replace('\n', ' '))
                output_file.write('\n')

In [9]:
len(glob.glob('/content/drive/MyDrive/2022-01-15_Course_project/new_prep_gen_text_es/*'))

10000

In [10]:
# Let's make corpus for bot texts

make_corpus('/content/drive/MyDrive/2022-01-15_Course_project/new_prep_gen_text_es/',
            '/content/drive/MyDrive/2022-01-15_Course_project/new_TF_IDF_clustering_bot_es/dataset_generate_es.txt') 

29/01/2022 23:02:24 |  number of processed files: 0 |  percentage of completion: 0.0 %
29/01/2022 23:06:03 |  number of processed files: 500 |  percentage of completion: 5.0 %
29/01/2022 23:06:03 |  number of processed files: 1000 |  percentage of completion: 10.0 %
29/01/2022 23:06:04 |  number of processed files: 1500 |  percentage of completion: 15.0 %
29/01/2022 23:06:04 |  number of processed files: 2000 |  percentage of completion: 20.0 %
29/01/2022 23:06:05 |  number of processed files: 2500 |  percentage of completion: 25.0 %
29/01/2022 23:06:05 |  number of processed files: 3000 |  percentage of completion: 30.0 %
29/01/2022 23:06:06 |  number of processed files: 3500 |  percentage of completion: 35.0 %
29/01/2022 23:06:06 |  number of processed files: 4000 |  percentage of completion: 40.0 %
29/01/2022 23:06:07 |  number of processed files: 4500 |  percentage of completion: 45.0 %
29/01/2022 23:06:07 |  number of processed files: 5000 |  percentage of completion: 50.0 %
29/01

In [11]:
# TF-IDF corpus

def make_table_and_dict(corpus_path, min_df, max_df, token_pattern = None, use_idf = True, stop_words = 'spanish'):
    
    with open(corpus_path, 'r') as corpus_file:
        if token_pattern:
            vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df, token_pattern=token_pattern, use_idf=use_idf)
        else:
            vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df)
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names(), vectorizer.idf_

In [12]:
def create_table(data_vectorized, k, name, path):
    u, sigma, vt = svds(data_vectorized, k)
    print(sigma)
    dict_ = np.dot(np.diag(sigma), vt).T
        
    with open(path + name + str(k) + '.pkl', 'wb') as f:
        pickle.dump(dict_, f)
    return dict_

In [14]:
# Vector representation on human texts (used on the bot)
f = open('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_es/TF_IDF_cut_human_es.pkl', 'rb')
dict_cut = pickle.load(f)

In [15]:
len(dict_cut.keys())

2079

# Making n-grams and Clustering

In [16]:
from itertools import product
from math import log
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import calinski_harabasz_score

In [17]:
def divide(data, labels):
    clusters = set(labels)
    clusters_data = []
    for cluster in clusters:
        clusters_data.append(data[labels == cluster, :])
    return clusters_data

def get_centroids(clusters):
    centroids = []
    for cluster_data in clusters:
        centroids.append(cluster_data.mean(axis=0))
    return centroids

In [18]:
def cohesion(data, labels):
    clusters = sorted(set(labels))
    sse = 0
    for cluster in clusters:
        cluster_data = data[labels == cluster, :]
        centroid = cluster_data.mean(axis = 0)
        sse += ((cluster_data - centroid)**2).sum()
    return sse

def separation(data, labels, cohesion_score):
    # calculate separation as SST - SSE
    return cohesion(data, np.zeros(data.shape[0])) - cohesion_score

def SST(data):
    c = get_centroids([data])
    return ((data - c) ** 2).sum()

def SSE(clusters, centroids):
    result = 0
    for cluster, centroid in zip(clusters, centroids):
        result += ((cluster - centroid) ** 2).sum()
    return result

# Clear the store before running each time
within_cluster_dist_sum_store = {}
def within_cluster_dist_sum(cluster, centroid, cluster_id):
    if cluster_id in within_cluster_dist_sum_store:
        return within_cluster_dist_sum_store[cluster_id]
    else:
        result = (((cluster - centroid) ** 2).sum(axis=1)**.5).sum()
        within_cluster_dist_sum_store[cluster_id] = result
    return result

def RMSSTD(data, clusters, centroids):
    df = data.shape[0] - len(clusters)
    attribute_num = data.shape[1]
    return (SSE(clusters, centroids) / (attribute_num * df)) ** .5

# equal to separation / (cohesion + separation)
def RS(data, clusters, centroids):
    sst = SST(data)
    sse = SSE(clusters, centroids)
    return (sst - sse) / sst

def DB_find_max_j(clusters, centroids, i):
    max_val = 0
    max_j = 0
    for j in range(len(clusters)):
        if j == i:
            continue
        cluster_i_stat = within_cluster_dist_sum(clusters[i], centroids[i], i) / clusters[i].shape[0]
        cluster_j_stat = within_cluster_dist_sum(clusters[j], centroids[j], j) / clusters[j].shape[0]
        val = (cluster_i_stat + cluster_j_stat) / (((centroids[i] - centroids[j]) ** 2).sum() ** .5)
        if val > max_val:
            max_val = val
            max_j = j
    return max_val

def DB(data, clusters, centroids):
    result = 0
    for i in range(len(clusters)):
        result += DB_find_max_j(clusters, centroids, i)
    return result / len(clusters)

def XB(data, clusters, centroids):
    sse = SSE(clusters, centroids)
    min_dist = ((centroids[0] - centroids[1]) ** 2).sum()
    for centroid_i, centroid_j in list(product(centroids, centroids)):
        if (centroid_i - centroid_j).sum() == 0:
            continue
        dist = ((centroid_i - centroid_j) ** 2).sum()
        if dist < min_dist:
            min_dist = dist
    return sse / (data.shape[0] * min_dist)

In [19]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

# Some metrics can work for a very long time (commented out)

def get_validation_scores(data, labels, max_clust = None):
    #within_cluster_dist_sum_store.clear()
    
    clusters = divide(data, labels)
    centroids = get_centroids(clusters)
    
    scores = {}
    if max_clust:
        if len(clusters) > max_clust:
            scores['cohesion'] = cohesion(data, labels)
            scores['separation'] = separation(data, labels, scores['cohesion'])
            scores['calinski_harabaz_score'] = None
            scores['RMSSTD'] = RMSSTD(data, clusters, centroids)
            scores['RS'] = RS(data, clusters, centroids)
            #scores['DB'] = None
            #scores['XB'] = XB(data, clusters, centroids)
            scores['silhouette'] = None
        else:
            scores['cohesion'] = cohesion(data, labels)
            scores['separation'] = separation(data, labels, scores['cohesion'])
            scores['calinski_harabaz_score'] = calinski_harabasz_score(data, labels)
            scores['RMSSTD'] = RMSSTD(data, clusters, centroids)
            scores['RS'] = RS(data, clusters, centroids)
            #scores['DB'] = DB(data, clusters, centroids)
            #scores['XB'] = XB(data, clusters, centroids)
            scores['silhouette'] = silhouette_score(data, labels)
    else:
        scores['cohesion'] = cohesion(data, labels)
        scores['separation'] = separation(data, labels, scores['cohesion'])
        scores['calinski_harabaz_score'] = calinski_harabasz_score(data, labels)
        scores['RMSSTD'] = RMSSTD(data, clusters, centroids)
        scores['RS'] = RS(data, clusters, centroids)
        #scores['DB'] = DB(data, clusters, centroids)
        #scores['XB'] = XB(data, clusters, centroids)
        scores['silhouette'] = silhouette_score(data, labels)
    
    return scores

In [20]:
def make_ngrams(input_corpus,  dict_, N = 2, m = None, uniq = False):
    dict_grams = dict()
    num_ = 0
    i = 0
    j = 0
    
    print('Count documents: ', len(input_corpus))
    for sentence in input_corpus:
        sentence = sentence.split(' ')
        grams = [sentence[i:i+N] for i in range(len(sentence)-N+1)]
        for g in grams:
            g_key = '_'.join(elem for elem in g)

            if uniq:
                if all(elem in dict_.keys()  for elem in g) and (g_key not in dict_grams.keys()):
                    dict_grams[g_key] = []
                    for elem in g:
                            if m:
                                dict_grams[g_key] += list(dict_[elem][1][:m])
                            else:
                                dict_grams[g_key] += list(dict_[elem][1])
            else:
                if all(elem in dict_.keys()  for elem in g):
                    concat = []
                    for elem in g:
                        if m:
                            concat += list(dict_[elem][1][:m])
                        else:
                            concat += list(dict_[elem][1])
                    dict_grams[i] = (j, g_key, concat)
                    i += 1
            j += 1
       
            
        if num_ % 500 == 0:
            now = datetime.now()
            dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
            print(dt_string, '| ',  'number of processed documents: ' + str(num_), '| ', 
                      'percentage of completion:', str(round(num_/len(input_corpus), 2)* 100) + ' %' )
        num_ += 1
    return dict_grams

In [21]:
with open('/content/drive/MyDrive/2022-01-15_Course_project/new_TF_IDF_clustering_bot_es/dataset_generate_es.txt', 'r') as corpus_file:
    corpus = corpus_file.readlines()

In [22]:
dict_grams_bot = make_ngrams(corpus,  dict_cut, N = 2, m = 10)

Count documents:  10000
29/01/2022 23:07:59 |  number of processed documents: 0 |  percentage of completion: 0.0 %
29/01/2022 23:08:01 |  number of processed documents: 500 |  percentage of completion: 5.0 %
29/01/2022 23:08:03 |  number of processed documents: 1000 |  percentage of completion: 10.0 %
29/01/2022 23:08:05 |  number of processed documents: 1500 |  percentage of completion: 15.0 %
29/01/2022 23:08:07 |  number of processed documents: 2000 |  percentage of completion: 20.0 %
29/01/2022 23:08:09 |  number of processed documents: 2500 |  percentage of completion: 25.0 %
29/01/2022 23:08:11 |  number of processed documents: 3000 |  percentage of completion: 30.0 %
29/01/2022 23:08:12 |  number of processed documents: 3500 |  percentage of completion: 35.0 %
29/01/2022 23:08:14 |  number of processed documents: 4000 |  percentage of completion: 40.0 %
29/01/2022 23:08:16 |  number of processed documents: 4500 |  percentage of completion: 45.0 %
29/01/2022 23:08:18 |  number of

In [23]:
len(dict_grams_bot.keys())

604267

In [24]:
X0 = []
for i in dict_grams_bot.keys():
    X0.append( dict_grams_bot[i][2])

list_gramm = [dict_grams_bot[i][1] for i in dict_grams_bot.keys()]
    
X_bot = pd.DataFrame(X0)
X_bot['ind'] = dict_grams_bot.keys()
X_bot['name'] = list_gramm

In [25]:
X_bot['name'].value_counts()[:5]

a_mi          10949
llegar_a      10362
pie_ligero     6167
llevar_a       4449
matar_a        4421
Name: name, dtype: int64

In [26]:
X_bot.shape

(604267, 22)

In [27]:
X_bot.to_csv('/content/drive/MyDrive/2022-01-15_Course_project/new_TF_IDF_clustering_bot_es/n_2gramm_bot_es.csv')
X_bot.shape

(604267, 22)

In [28]:
list_col = list(X_bot.columns)
for i in ['Unnamed: 0', 'ind', 'name']:
    if i in list_col:
        list_col.remove(i)

In [29]:
#GridSearch for Clustering
grid_result = []
for sig in [1000, 100000]:
    for nei in [50, 100]:
        
        print(datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3], '| begin |', 'significance: ', sig, '| neighbors: ', nei )
        clust = Wishart(significance_level = sig, wishart_neighbors = nei)
        result = clust.fit(X_bot[list_col])
        dict_r = get_validation_scores(np.array(X_bot[list_col]), clust.object_labels, max_clust = 10000)
        dict_r['significance'] = sig
        dict_r['neighbors'] = nei
        dict_r['cluster_num'] = len(set(clust.object_labels))
        grid_result.append(dict_r)
        
        #add clustering result to table
        name_col = 'cluster_' + str(sig) + str(nei)
        X_bot[name_col] = clust.object_labels
        
        print(datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3], '| end |',  dict_r)

        X_bot.to_csv('/content/drive/MyDrive/2022-01-15_Course_project/new_TF_IDF_clustering_bot_es/n_2gramm_bot_es.csv')

2022-01-29 23:09:49.794 | begin | significance:  1000 | neighbors:  50
Start clustering




2022-01-29 23:59:22.605 | end | {'cohesion': 14485.81831072765, 'separation': 15821.372948615282, 'calinski_harabaz_score': None, 'RMSSTD': 0.039203958053728054, 'RS': 0.522033626053617, 'silhouette': None, 'significance': 1000, 'neighbors': 50, 'cluster_num': 133015}
2022-01-29 23:59:46.858 | begin | significance:  1000 | neighbors:  100
Start clustering




2022-01-30 00:55:50.298 | end | {'cohesion': 14226.32451029995, 'separation': 16080.866749042983, 'calinski_harabaz_score': None, 'RMSSTD': 0.03703666307936679, 'RS': 0.5305957457897277, 'silhouette': None, 'significance': 1000, 'neighbors': 100, 'cluster_num': 85707}
2022-01-30 00:56:16.321 | begin | significance:  100000 | neighbors:  50
Start clustering




2022-01-30 01:41:42.581 | end | {'cohesion': 14485.81831072765, 'separation': 15821.372948615282, 'calinski_harabaz_score': None, 'RMSSTD': 0.039203958053728054, 'RS': 0.522033626053617, 'silhouette': None, 'significance': 100000, 'neighbors': 50, 'cluster_num': 133015}
2022-01-30 01:42:07.493 | begin | significance:  100000 | neighbors:  100
Start clustering




2022-01-30 02:38:24.821 | end | {'cohesion': 14230.40270015136, 'separation': 16076.788559191573, 'calinski_harabaz_score': None, 'RMSSTD': 0.03704193554105013, 'RS': 0.53046118400152, 'silhouette': None, 'significance': 100000, 'neighbors': 100, 'cluster_num': 85706}
