In [1]:
import re
import glob
import pandas as pd
import os
import pickle
import numpy as np
from datetime import datetime
import codecs

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from scipy.sparse.linalg import svds
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
!pip install unidecode



In [4]:
import unidecode

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Wishart clustering function
# https://github.com/Radi4/BotDetection/blob/master/Wishart.py

import numpy as np
from scipy.special import gamma
from sklearn.neighbors import KDTree
from collections import defaultdict
from tqdm import tqdm

class Wishart:
    def __init__(self, wishart_neighbors, significance_level):
        self.wishart_neighbors = wishart_neighbors  # Number of neighbors
        self.significance_level = significance_level  # Significance level

    def fit(self, X):
        from sklearn.neighbors import KDTree
        kdt = KDTree(X, metric='euclidean')

        #add one because you are your neighb.
        distances, neighbors = kdt.query(X, k = self.wishart_neighbors + 1, return_distance = True)
        neighbors = neighbors[:, 1:]


        distances = distances[:, -1]
        indexes = np.argsort(distances)
        
        size, dim = X.shape

        self.object_labels = np.zeros(size, dtype = int) - 1

        #index in tuple
        #min_dist, max_dist, flag_to_significant
        self.clusters = np.array([(1., 1., 0)])
        self.clusters_to_objects = defaultdict(list)
        print('Start clustering')

        for index in indexes:
            neighbors_clusters =\
                np.concatenate([self.object_labels[neighbors[index]], self.object_labels[neighbors[index]]])
            unique_clusters = np.unique(neighbors_clusters).astype(int)
            unique_clusters = unique_clusters[unique_clusters != -1]


            if len(unique_clusters) == 0:
                self._create_new_cluster(index, distances[index])
            else:
                max_cluster = unique_clusters[-1]
                min_cluster = unique_clusters[0]
                if max_cluster == min_cluster:
                    if self.clusters[max_cluster][-1] < 0.5:
                        self._add_elem_to_exist_cluster(index, distances[index], max_cluster)
                    else:
                        self._add_elem_to_noise(index)
                else:
                    my_clusters = self.clusters[unique_clusters]
                    flags = my_clusters[:, -1]
                    if np.min(flags) > 0.5:
                        self._add_elem_to_noise(index)
                    else:
                        significan = np.power(my_clusters[:, 0], -dim) - np.power(my_clusters[:, 1], -dim)
                        significan *= self.wishart_neighbors
                        significan /= size
                        significan /= np.power(np.pi, dim / 2)
                        significan *= gamma(dim / 2 + 1)
                        significan_index = significan >= self.significance_level

                        significan_clusters = unique_clusters[significan_index]
                        not_significan_clusters = unique_clusters[~significan_index]
                        significan_clusters_count = len(significan_clusters)
                        if significan_clusters_count > 1 or min_cluster == 0:
                            self._add_elem_to_noise(index)
                            self.clusters[significan_clusters, -1] = 1
                            for not_sig_cluster in not_significan_clusters:
                                if not_sig_cluster == 0:
                                    continue

                                for bad_index in self.clusters_to_objects[not_sig_cluster]:
                                    self._add_elem_to_noise(bad_index)
                                self.clusters_to_objects[not_sig_cluster].clear()
                        else:
                            for cur_cluster in unique_clusters:
                                if cur_cluster == min_cluster:
                                    continue

                                for bad_index in self.clusters_to_objects[cur_cluster]:
                                    self._add_elem_to_exist_cluster(bad_index, distances[bad_index], min_cluster)
                                self.clusters_to_objects[cur_cluster].clear()

                            self._add_elem_to_exist_cluster(index, distances[index], min_cluster)

        return self.clean_data()

    def clean_data(self):
        unique = np.unique(self.object_labels)
        index = np.argsort(unique)
        if unique[0] != 0:
            index += 1
        true_cluster = {unq :  index for unq, index in zip(unique, index)}
        result = np.zeros(len(self.object_labels), dtype = int)
        for index, unq in enumerate(self.object_labels):
            result[index] = true_cluster[unq]
        return result

    def _add_elem_to_noise(self, index):
        self.object_labels[index] = 0
        self.clusters_to_objects[0].append(index)

    def _create_new_cluster(self, index, dist):
        self.object_labels[index] = len(self.clusters)
        self.clusters_to_objects[len(self.clusters)].append(index)
        self.clusters = np.append(self.clusters, [(dist, dist, 0)], axis = 0)

    def _add_elem_to_exist_cluster(self, index, dist, cluster_label):
        self.object_labels[index] = cluster_label
        self.clusters_to_objects[cluster_label].append(index)
        self.clusters[cluster_label][0] = min(self.clusters[cluster_label][0], dist)
        self.clusters[cluster_label][1] = max(self.clusters[cluster_label][1], dist)


class PreTrainWishart:
    def __init__(self, wishart_neighbors, significance_level, distances, neighbors):
        self.wishart_neighbors = wishart_neighbors  # Number of neighbors
        self.significance_level = significance_level  # Significance level
        self.distances = distances
        self.neighbors = neighbors

    def fit(self, X):
        from sklearn.neighbors import KDTree
        kdt = KDTree(X, metric='euclidean')

        #add one because you are your neighb.
        neighbors = self.neighbors[:, 1 : self.wishart_neighbors + 1]
        distances = self.distances[:, self.wishart_neighbors]
        indexes = np.argsort(distances)
        
        size, dim = X.shape

        self.object_labels = np.zeros(size, dtype = int) - 1

        #index in tuple
        #min_dist, max_dist, flag_to_significant
        self.clusters = np.array([(1., 1., 0)])
        self.clusters_to_objects = defaultdict(list)

        for index in indexes:
            neighbors_clusters =\
                np.concatenate([self.object_labels[neighbors[index]], self.object_labels[neighbors[index]]])
            unique_clusters = np.unique(neighbors_clusters).astype(int)
            unique_clusters = unique_clusters[unique_clusters != -1]


            if len(unique_clusters) == 0:
                self._create_new_cluster(index, distances[index])
            else:
                max_cluster = unique_clusters[-1]
                min_cluster = unique_clusters[0]
                if max_cluster == min_cluster:
                    if self.clusters[max_cluster][-1] < 0.5:
                        self._add_elem_to_exist_cluster(index, distances[index], max_cluster)
                    else:
                        self._add_elem_to_noise(index)
                else:
                    my_clusters = self.clusters[unique_clusters]
                    flags = my_clusters[:, -1]
                    if np.min(flags) > 0.5:
                        self._add_elem_to_noise(index)
                    else:
                        significan = np.power(my_clusters[:, 0], -dim) - np.power(my_clusters[:, 1], -dim)
                        significan *= self.wishart_neighbors
                        significan /= size
                        significan /= np.power(np.pi, dim / 2)
                        significan *= gamma(dim / 2 + 1)
                        significan_index = significan >= self.significance_level

                        significan_clusters = unique_clusters[significan_index]
                        not_significan_clusters = unique_clusters[~significan_index]
                        significan_clusters_count = len(significan_clusters)
                        if significan_clusters_count > 1 or min_cluster == 0:
                            self._add_elem_to_noise(index)
                            self.clusters[significan_clusters, -1] = 1
                            for not_sig_cluster in not_significan_clusters:
                                if not_sig_cluster == 0:
                                    continue

                                for bad_index in self.clusters_to_objects[not_sig_cluster]:
                                    self._add_elem_to_noise(bad_index)
                                self.clusters_to_objects[not_sig_cluster].clear()
                        else:
                            for cur_cluster in unique_clusters:
                                if cur_cluster == min_cluster:
                                    continue

                                for bad_index in self.clusters_to_objects[cur_cluster]:
                                    self._add_elem_to_exist_cluster(bad_index, distances[bad_index], min_cluster)
                                self.clusters_to_objects[cur_cluster].clear()

                            self._add_elem_to_exist_cluster(index, distances[index], min_cluster)

        return self.clean_data()

    def clean_data(self):
        unique = np.unique(self.object_labels)
        index = np.argsort(unique)
        if unique[0] != 0:
            index += 1
        true_cluster = {unq :  index for unq, index in zip(unique, index)}
        result = np.zeros(len(self.object_labels), dtype = int)
        for index, unq in enumerate(self.object_labels):
            result[index] = true_cluster[unq]
        return result

    def _add_elem_to_noise(self, index):
        self.object_labels[index] = 0
        self.clusters_to_objects[0].append(index)

    def _create_new_cluster(self, index, dist):
        self.object_labels[index] = len(self.clusters)
        self.clusters_to_objects[len(self.clusters)].append(index)
        self.clusters = np.append(self.clusters, [(dist, dist, 0)], axis = 0)

    def _add_elem_to_exist_cluster(self, index, dist, cluster_label):
        self.object_labels[index] = cluster_label
        self.clusters_to_objects[cluster_label].append(index)
        self.clusters[cluster_label][0] = min(self.clusters[cluster_label][0], dist)
        self.clusters[cluster_label][1] = max(self.clusters[cluster_label][1], dist)

## Сreate a vector representation based on TfidfVectorizer (on human texts)

In [7]:
# Stopwords for English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
def make_corpus(input_path, output_file_path):
    i = 0
    file_list = glob.glob(input_path + '*')
    
    with open(output_file_path, 'w+') as output_file:
        for file in file_list:
            if i % 500 == 0:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                print(dt_string, '| ',  'number of processed files: ' + str(i), '| ', 
                      'percentage of completion:', str(round(i/len(file_list), 2)* 100) + ' %' )
            i+=1
            with open(file, 'r') as input_file:
                output_file.write(input_file.read().replace('\n', ' '))
                output_file.write('\n')

In [16]:
# Let's select 10k texts in a folder: '/content/drive/MyDrive/2022-01-15_Course_project/10000_cut_en/'
# Because clustering works for a very long time on large datasets

import shutil
file_list = glob.glob('/content/drive/MyDrive/2022-01-15_Course_project/cut_en/*')
k = 0

for i in file_list:
  shutil.copy(i, '/content/drive/MyDrive/2022-01-15_Course_project/10000_cut_en/')
  k += 1
  if k >= 10000:
    break


In [9]:
len(glob.glob('/content/drive/MyDrive/2022-01-15_Course_project/cut_en/*'))

14788

In [10]:
len(glob.glob('/content/drive/MyDrive/2022-01-15_Course_project/10000_cut_en/*'))

10000

In [11]:
# Let's make corpus for human-texts

make_corpus('/content/drive/MyDrive/2022-01-15_Course_project/10000_cut_en/', 
            '/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_dataset_human_en.txt') 

22/01/2022 17:40:15 |  number of processed files: 0 |  percentage of completion: 0.0 %
22/01/2022 17:43:39 |  number of processed files: 500 |  percentage of completion: 5.0 %
22/01/2022 17:43:39 |  number of processed files: 1000 |  percentage of completion: 10.0 %
22/01/2022 17:43:40 |  number of processed files: 1500 |  percentage of completion: 15.0 %
22/01/2022 17:43:40 |  number of processed files: 2000 |  percentage of completion: 20.0 %
22/01/2022 17:43:40 |  number of processed files: 2500 |  percentage of completion: 25.0 %
22/01/2022 17:43:41 |  number of processed files: 3000 |  percentage of completion: 30.0 %
22/01/2022 17:43:41 |  number of processed files: 3500 |  percentage of completion: 35.0 %
22/01/2022 17:43:42 |  number of processed files: 4000 |  percentage of completion: 40.0 %
22/01/2022 17:43:42 |  number of processed files: 4500 |  percentage of completion: 45.0 %
22/01/2022 17:43:42 |  number of processed files: 5000 |  percentage of completion: 50.0 %
22/01

In [12]:
# TF_IDF corpus

def make_table_and_dict(corpus_path, min_df, max_df, token_pattern = None, use_idf = True, stop_words = 'english'):
    
    with open(corpus_path, 'r') as corpus_file:
        if token_pattern:
            vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df, token_pattern=token_pattern, use_idf=use_idf)
        else:
            vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df)
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names(), vectorizer.idf_

In [13]:
def create_table(data_vectorized, k, name, path):
    u, sigma, vt = svds(data_vectorized, k)
    print(sigma)
    dict_ = np.dot(np.diag(sigma), vt).T
        
    with open(path + name + str(k) + '.pkl', 'wb') as f:
        pickle.dump(dict_, f)
    return dict_

In [14]:
#create TF_IDF on human text
en_data_vectorized, en_dictionary, idfs = make_table_and_dict('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_dataset_human_en.txt', 
                                                                0.05,  0.7 , token_pattern = '[A-Za-z]+', 
                                                                stop_words = stopwords.words('english'))



In [15]:
dict_ = create_table(en_data_vectorized, 100, '10000_SVD_human_en_', 
                     '/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/')

[ 1.38702534  1.38991427  1.39921949  1.40407253  1.40515473  1.41273601
  1.41685925  1.42255831  1.42512507  1.43312768  1.44899768  1.45360187
  1.45775257  1.46675168  1.46970726  1.48272918  1.48541388  1.49475139
  1.49852761  1.51090422  1.51314042  1.52491342  1.53397807  1.55123462
  1.56130332  1.56708245  1.5771239   1.58572565  1.5987485   1.60246324
  1.61639271  1.62697481  1.63935747  1.65498008  1.66925057  1.67444444
  1.67578791  1.69653738  1.70503331  1.71010821  1.73648524  1.74197619
  1.7527861   1.77473808  1.7914766   1.80187569  1.8174394   1.82934986
  1.85557111  1.85997274  1.87772936  1.89107364  1.91973095  1.92822272
  1.96137951  1.97213453  1.99313352  2.01216444  2.04662524  2.06448466
  2.0765497   2.10583484  2.12314859  2.14921483  2.18436206  2.22654352
  2.23527498  2.25754432  2.30579807  2.3501784   2.37064469  2.41853709
  2.47262482  2.48654544  2.54329048  2.63465823  2.66614169  2.70165518
  2.74540376  2.77135995  2.82950277  2.97132028  3

In [16]:
pairs_0 = list(zip(idfs, dict_))
pairs_idf = dict(zip(en_dictionary, pairs_0))

In [17]:
try:
    file = open('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_TF_IDF_human_en.pkl', 'wb')
    pickle.dump(pairs_idf, file)
    file.close()
except:
    print("Something went wrong")

In [18]:
# Removing frequently used words
dict_cut = dict()
for w in pairs_idf.keys():
    if pairs_idf[w][0] > 1.5:
        dict_cut[w] = pairs_idf[w]

In [19]:
try:
    file = open('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_TF_IDF_cut_human_en.pkl', 'wb')
    pickle.dump(dict_cut, file)
    file.close()
except:
    print("Something went wrong")

In [20]:
len(en_dictionary)

1607

In [21]:
len(dict_cut.keys())

1528

# Making n-grams and Clustering

In [22]:
from itertools import product
from math import log
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import calinski_harabasz_score

In [23]:
def divide(data, labels):
    clusters = set(labels)
    clusters_data = []
    for cluster in clusters:
        clusters_data.append(data[labels == cluster, :])
    return clusters_data

def get_centroids(clusters):
    centroids = []
    for cluster_data in clusters:
        centroids.append(cluster_data.mean(axis=0))
    return centroids

In [24]:
def cohesion(data, labels):
    clusters = sorted(set(labels))
    sse = 0
    for cluster in clusters:
        cluster_data = data[labels == cluster, :]
        centroid = cluster_data.mean(axis = 0)
        sse += ((cluster_data - centroid)**2).sum()
    return sse

def separation(data, labels, cohesion_score):
    # calculate separation as SST - SSE
    return cohesion(data, np.zeros(data.shape[0])) - cohesion_score

def SST(data):
    c = get_centroids([data])
    return ((data - c) ** 2).sum()

def SSE(clusters, centroids):
    result = 0
    for cluster, centroid in zip(clusters, centroids):
        result += ((cluster - centroid) ** 2).sum()
    return result

# Clear the store before running each time
within_cluster_dist_sum_store = {}
def within_cluster_dist_sum(cluster, centroid, cluster_id):
    if cluster_id in within_cluster_dist_sum_store:
        return within_cluster_dist_sum_store[cluster_id]
    else:
        result = (((cluster - centroid) ** 2).sum(axis=1)**.5).sum()
        within_cluster_dist_sum_store[cluster_id] = result
    return result

def RMSSTD(data, clusters, centroids):
    df = data.shape[0] - len(clusters)
    attribute_num = data.shape[1]
    return (SSE(clusters, centroids) / (attribute_num * df)) ** .5

# equal to separation / (cohesion + separation)
def RS(data, clusters, centroids):
    sst = SST(data)
    sse = SSE(clusters, centroids)
    return (sst - sse) / sst

def DB_find_max_j(clusters, centroids, i):
    max_val = 0
    max_j = 0
    for j in range(len(clusters)):
        if j == i:
            continue
        cluster_i_stat = within_cluster_dist_sum(clusters[i], centroids[i], i) / clusters[i].shape[0]
        cluster_j_stat = within_cluster_dist_sum(clusters[j], centroids[j], j) / clusters[j].shape[0]
        val = (cluster_i_stat + cluster_j_stat) / (((centroids[i] - centroids[j]) ** 2).sum() ** .5)
        if val > max_val:
            max_val = val
            max_j = j
    return max_val

def DB(data, clusters, centroids):
    result = 0
    for i in range(len(clusters)):
        result += DB_find_max_j(clusters, centroids, i)
    return result / len(clusters)

def XB(data, clusters, centroids):
    sse = SSE(clusters, centroids)
    min_dist = ((centroids[0] - centroids[1]) ** 2).sum()
    for centroid_i, centroid_j in list(product(centroids, centroids)):
        if (centroid_i - centroid_j).sum() == 0:
            continue
        dist = ((centroid_i - centroid_j) ** 2).sum()
        if dist < min_dist:
            min_dist = dist
    return sse / (data.shape[0] * min_dist)

In [25]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

# Some metrics can work for a very long time (commented out)

def get_validation_scores(data, labels, max_clust = None):
    #within_cluster_dist_sum_store.clear()
    
    clusters = divide(data, labels)
    centroids = get_centroids(clusters)
    
    scores = {}
    if max_clust:
        if len(clusters) > max_clust:
            scores['cohesion'] = cohesion(data, labels)
            scores['separation'] = separation(data, labels, scores['cohesion'])
            scores['calinski_harabaz_score'] = None
            scores['RMSSTD'] = RMSSTD(data, clusters, centroids)
            scores['RS'] = RS(data, clusters, centroids)
            #scores['DB'] = None
            #scores['XB'] = XB(data, clusters, centroids)
            scores['silhouette'] = None
        else:
            scores['cohesion'] = cohesion(data, labels)
            scores['separation'] = separation(data, labels, scores['cohesion'])
            scores['calinski_harabaz_score'] = calinski_harabasz_score(data, labels)
            scores['RMSSTD'] = RMSSTD(data, clusters, centroids)
            scores['RS'] = RS(data, clusters, centroids)
            #scores['DB'] = DB(data, clusters, centroids)
            #scores['XB'] = XB(data, clusters, centroids)
            scores['silhouette'] = silhouette_score(data, labels)
    else:
        scores['cohesion'] = cohesion(data, labels)
        scores['separation'] = separation(data, labels, scores['cohesion'])
        scores['calinski_harabaz_score'] = calinski_harabasz_score(data, labels)
        scores['RMSSTD'] = RMSSTD(data, clusters, centroids)
        scores['RS'] = RS(data, clusters, centroids)
        #scores['DB'] = DB(data, clusters, centroids)
        #scores['XB'] = XB(data, clusters, centroids)
        scores['silhouette'] = silhouette_score(data, labels)
    
    return scores

In [26]:
def make_ngrams(input_corpus,  dict_, N = 2, m = None, uniq = False):
    dict_grams = dict()
    num_ = 0
    i = 0
    j = 0
    
    print('Count documents: ', len(input_corpus))
    for sentence in input_corpus:
        sentence = sentence.split(' ')
        grams = [sentence[i:i+N] for i in range(len(sentence)-N+1)]
        for g in grams:
            g_key = '_'.join(elem for elem in g)

            if uniq:
                if all(elem in dict_.keys()  for elem in g) and (g_key not in dict_grams.keys()):
                    dict_grams[g_key] = []
                    for elem in g:
                            if m:
                                dict_grams[g_key] += list(dict_[elem][1][:m])
                            else:
                                dict_grams[g_key] += list(dict_[elem][1])
            else:
                if all(elem in dict_.keys()  for elem in g):
                    concat = []
                    for elem in g:
                        if m:
                            concat += list(dict_[elem][1][:m])
                        else:
                            concat += list(dict_[elem][1])
                    dict_grams[i] = (j, g_key, concat)
                    i += 1
            j += 1
       
            
        if num_ % 500 == 0:
            now = datetime.now()
            dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
            print(dt_string, '| ',  'number of processed documents: ' + str(num_), '| ', 
                      'percentage of completion:', str(round(num_/len(input_corpus), 2)* 100) + ' %' )
        num_ += 1
    return dict_grams

In [27]:
with open('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_dataset_human_en.txt', 'r') as corpus_file:
    corpus = corpus_file.readlines()

In [28]:
dict_grams_human = make_ngrams(corpus,  dict_cut, N = 2, m = 10)

Count documents:  10000
22/01/2022 18:00:12 |  number of processed documents: 0 |  percentage of completion: 0.0 %
22/01/2022 18:00:14 |  number of processed documents: 500 |  percentage of completion: 5.0 %
22/01/2022 18:00:16 |  number of processed documents: 1000 |  percentage of completion: 10.0 %
22/01/2022 18:00:18 |  number of processed documents: 1500 |  percentage of completion: 15.0 %
22/01/2022 18:00:20 |  number of processed documents: 2000 |  percentage of completion: 20.0 %
22/01/2022 18:00:22 |  number of processed documents: 2500 |  percentage of completion: 25.0 %
22/01/2022 18:00:24 |  number of processed documents: 3000 |  percentage of completion: 30.0 %
22/01/2022 18:00:26 |  number of processed documents: 3500 |  percentage of completion: 35.0 %
22/01/2022 18:00:28 |  number of processed documents: 4000 |  percentage of completion: 40.0 %
22/01/2022 18:00:30 |  number of processed documents: 4500 |  percentage of completion: 45.0 %
22/01/2022 18:00:32 |  number of

In [29]:
len(dict_grams_human.keys())

538810

In [30]:
X0 = []
for i in dict_grams_human.keys():
    X0.append( dict_grams_human[i][2])

list_gramm = [dict_grams_human[i][1] for i in dict_grams_human.keys()]
    
X_human = pd.DataFrame(X0)
X_human['ind'] = dict_grams_human.keys()
X_human['name'] = list_gramm

In [31]:
X_human['name'].value_counts()[:5]

o_clock         1186
every_thing     1002
pass_through     783
sit_down         650
slave_trade      641
Name: name, dtype: int64

In [32]:
X_human.shape

(538810, 22)

In [33]:
X_human.to_csv('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_n_2gramm_human_en.csv')
X_human.shape

(538810, 22)

In [34]:
list_col = list(X_human.columns)
for i in ['Unnamed: 0', 'ind', 'name']:
    if i in list_col:
        list_col.remove(i)

In [35]:
#GridSearch for Clustering
grid_result = []
for sig in [1000, 100000]:
    for nei in [50, 100]:
        
        print(datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3], '| begin |', 'significance: ', sig, '| neighbors: ', nei )
        clust = Wishart(significance_level = sig, wishart_neighbors = nei)
        result = clust.fit(X_human[list_col])
        dict_r = get_validation_scores(np.array(X_human[list_col]), clust.object_labels, max_clust = 10000)
        dict_r['significance'] = sig
        dict_r['neighbors'] = nei
        dict_r['cluster_num'] = len(set(clust.object_labels))
        grid_result.append(dict_r)
        
        #add clustering result to table
        name_col = 'cluster_' + str(sig) + str(nei)
        X_human[name_col] = clust.object_labels
        
        print(datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3], '| end |',  dict_r)

        X_human.to_csv('/content/drive/MyDrive/2022-01-15_Course_project/TF_IDF_clustering_en/10000_n_2gramm_human_en.csv')

2022-01-22 18:03:35.608 | begin | significance:  1000 | neighbors:  50
Start clustering




2022-01-22 21:02:59.145 | end | {'cohesion': 8571.319560256108, 'separation': 12140.398810480781, 'calinski_harabaz_score': 195.9886519420432, 'RMSSTD': 0.028304474174194103, 'RS': 0.5861608676387597, 'silhouette': -0.1965591040947928, 'significance': 1000, 'neighbors': 50, 'cluster_num': 3867}
2022-01-22 21:03:21.913 | begin | significance:  1000 | neighbors:  100
Start clustering




2022-01-23 00:37:20.579 | end | {'cohesion': 8146.544521532364, 'separation': 12565.173849204526, 'calinski_harabaz_score': 685.8482355045229, 'RMSSTD': 0.027525935621499728, 'RS': 0.6066697907092811, 'silhouette': -0.137816160048998, 'significance': 1000, 'neighbors': 100, 'cluster_num': 1210}
2022-01-23 00:37:43.757 | begin | significance:  100000 | neighbors:  50
Start clustering




2022-01-23 03:35:54.893 | end | {'cohesion': 8571.319560256108, 'separation': 12140.398810480781, 'calinski_harabaz_score': 195.9886519420432, 'RMSSTD': 0.028304474174194103, 'RS': 0.5861608676387597, 'silhouette': -0.1965591040947928, 'significance': 100000, 'neighbors': 50, 'cluster_num': 3867}
2022-01-23 03:36:17.993 | begin | significance:  100000 | neighbors:  100
Start clustering




2022-01-23 07:10:09.765 | end | {'cohesion': 8146.544521532364, 'separation': 12565.173849204526, 'calinski_harabaz_score': 685.8482355045229, 'RMSSTD': 0.027525935621499728, 'RS': 0.6066697907092811, 'silhouette': -0.137816160048998, 'significance': 100000, 'neighbors': 100, 'cluster_num': 1210}
