In [1]:
import datetime

import numpy as np
import pandas as pd

# Output locations, relative to the working directory. Each ends with '/'
# because the code below builds file names by plain string concatenation.
FIGURES_PATH = 'out/figures/'  # Tester.plot saves its charts here
DATASETS_PATH = 'out/datasets/'  # Tester.run_statistics reads its CSV from here
CLUSTERS_PATH = 'out/clusters/'  # not referenced in the cells shown here
DICTS_PATH = 'out/dicts/'  # not referenced in the cells shown here

In [2]:
import pickle
from datetime import timedelta
from helper import save

import psutil
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from distances import Distances
from clustering import Clustering, get_dists
from visualisation import Visualisation

from netgraph import Graph, InteractiveGraph, EditableGraph
import matplotlib.pyplot as plt
import json

# Enable tqdm's pandas integration (per the tqdm docs this adds the
# `progress_apply` family to pandas objects); must run after tqdm is imported.
tqdm.pandas()
from helper import *

In [2]:
class Tester:
    def __init__(self):
        return


    def dist_between(self, cluster1, cluster2, dists):
        s = 0.0
        cnt = 0
        for i in cluster1:
            for j in cluster2:
                s += self.get_dist(i, j, dists) ** 2
                cnt += 1

        if s == 0:
            return 0.01
        return np.sqrt(s) / cnt


    @staticmethod
    def tp_tn_fp_fn(clusters1, clusters2):
        elements = list(set(np.concatenate(clusters1)))
        cnt = 0
        tp, tn = 0, 0
        fn, fp = 0, 0
        for i in range(len(elements)):
            for j in range(i + 1, len(elements)):

                in_one1 = False
                in_one2 = False

                for c1 in clusters1:
                    if elements[i] in c1 and elements[j] in c1:
                        in_one1 = True
                        break

                for c2 in clusters2:
                    if elements[i] in c2 and elements[j] in c2:
                        in_one2 = True
                        break

                if in_one1 and in_one2:
                    tp += 1

                elif (not in_one1) and (not in_one2):
                    tn += 1

                elif in_one1 and (not in_one2):
                    fn += 1

                elif (not in_one1) and in_one2:
                    fp += 1

                cnt += 1

        return tp, tn, fp, fn

    @staticmethod
    def get_dist(i, j, dists):
        if (i, j) in dists:
            return dists[i, j]

        if (j, i) in dists:
            return dists[j, i]

        return 100

    @staticmethod
    def rand(tp, tn, fp, fn):
        cnt = tp + tn + fp + fn
        return (tp + tn) / cnt

    @staticmethod
    def adjusted_rand(tp, tn, fp, fn):
        n = tp + tn + fp + fn
        return (tp + tn) / n - ((tp + fp) * (tp + fn) + (fn + tn) * (fp + tn)) / (n ** 2)

    @staticmethod
    def fm(tp, tn, fp, fn):
        return tp / np.sqrt((tp + fp) * (tp + fn))

    @staticmethod
    def jaccard(tp, tn, fp, fn):
        return tp / (tp + fp + fn)

    @staticmethod
    def f1(tp, tn, fp, fn):
        p, r = tp / (tp + fp), tp / (tp + fn)
        return 2 * p * r / (p + r)

    @staticmethod
    def cohen(inter, outer):
        mu = np.mean(inter)
        d = np.mean(outer)
        return (mu - d) / (max(mu, d))


    def silhouette(self, clusters, dists):
        s = dict()
        for i1 in range(len(clusters)):
            closest_cluster = i1
            min_d = np.inf

            for i2 in range(len(clusters)):
                if i2 == i1:
                    continue
                d = self.dist_between(clusters[i1], clusters[i2], dists)
                if d < min_d:
                    min_d = d
                    closest_cluster = i2


            for j in range(len(clusters[i1])):
                sum = 0.0
                cnt = 0.0
                for j1 in range(len(clusters[i1])):
                    if j != j1:
                        sum += self.get_dist(clusters[i1][j], clusters[i1][j1], dists)
                        cnt += 1

                aj = sum / cnt
                sum = 0.0
                cnt = 0.0
                for j1 in range(len(clusters[closest_cluster])):
                    sum += self.get_dist(clusters[i1][j], clusters[closest_cluster][j1], dists)
                    cnt += 1

                bj = sum / cnt

                s[clusters[i1][j]] = (bj - aj) / max(bj, aj)

        return s

    def inter(self, clusters, dists):
        res = []
        for cluster in clusters:
            s = 0.0
            cnt = 0
            for i in range(len(cluster)):
                for j in range(i + 1, len(cluster)):
                    s += self.get_dist(i, j, dists) ** 2
                    cnt += 1
            if cnt == 0:
                res.append(s)
            else:
                res.append(np.sqrt(s) / cnt)
        return res

    def outer(self, clusters, dists):
        outer = np.full((len(clusters), len(clusters)), 0)
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                outer[i, j] = self.dist_between(clusters[i], clusters[j], dists)
                outer[j, i] = outer[i, j]

        return outer


    @staticmethod
    def dbi(inter, outer):
        s = 0.0
        for i in range(len(inter)):
            max = -np.inf
            for j in range(len(inter)):
                if i != j:
                    dbi = (inter[i] + inter[j]) / outer[i, j]
                    if dbi > max:
                        max = dbi
            s += max
        return s / len(inter)



    @staticmethod
    def plot(stats):
        """Render and save charts comparing the clustering methods in ``stats``.

        ``stats`` maps a method name to the per-method dictionary assembled by
        ``run_statistics``.

        * Metrics stored as a single number (jaccard, fm, rand, adjusted_rand,
          f1) get one bar per method.
        * Metrics stored as a two-item list, one value per clustering
          (silhouette, dbi, cohen, wss, bss), get a pair of bars per method.
          ``silhouette`` belongs here because ``run_statistics`` stores it as
          ``[mean1, mean2]``; plotting it as a scalar breaks ``plt.bar``.
        * Three more figures come from ``stats[name]['statistics']``: minimal
          distances per iteration, total clustering time, and time per iteration.

        Every figure is written to ``FIGURES_PATH + '___' + <name>``. ``stats``
        is not modified and nothing is returned.
        """
        names = list(stats.keys())
        renaming = {
            'silhouette': 'Silhouette Index',
            'dbi': 'Davies–Bouldin Index',
            'cohen': 'Cohen\'s Index',
            'jaccard': 'Jaccard Coefficient',
            'fm': 'Fowlkes-Mallows Index',
            'rand': 'Rand Index',
            'adjusted_rand': 'Adjusted Rand Index',
            'wss': 'Cluster Cohesion',
            'bss': 'Cluster Separation',
            'f1': 'F1-score'
        }
        # Reference value drawn as a dashed red line on each metric chart.
        optimal = {
            'silhouette': 1,
            'dbi': 0,
            'cohen': 1,
            'jaccard': 1,
            'fm': 1,
            'rand': 1,
            'adjusted_rand': 1,
            'wss': 0,
            'bss': 100,
            'f1': 1
        }

        # One value per method.
        for metric in ['jaccard', 'fm', 'rand', 'adjusted_rand', 'f1']:
            values = [stats[n][metric] for n in names]
            plt.figure(figsize=(5, 5))
            plt.grid(zorder=0)
            plt.title(label=renaming[metric])
            plt.bar(names, values)
            plt.axhline(optimal[metric], color='red', label='optimal', linestyle='--')
            plt.xticks(rotation=45)
            plt.xlabel('Method')
            plt.ylabel('Score')
            plt.legend()
            plt.savefig(FIGURES_PATH + '___' + metric)

        # Two values per method: index 0 for the first clustering, 1 for the second.
        for metric in ['silhouette', 'dbi', 'cohen', 'wss', 'bss']:
            values1 = [stats[n][metric][0] for n in names]
            values2 = [stats[n][metric][1] for n in names]

            plt.figure(figsize=(5, 5))
            plt.grid(zorder=0)
            plt.title(label=renaming[metric])

            names_axis = np.arange(len(names))
            plt.bar(names_axis - 0.2, values1, 0.4, label='First clustering')
            plt.bar(names_axis + 0.2, values2, 0.4, label='Second clustering')
            plt.xticks(names_axis, names, rotation=45)
            plt.axhline(optimal[metric], color='red', label='optimal', linestyle='--')
            plt.xlabel('Method')
            plt.ylabel('Score')
            plt.legend()
            plt.savefig(FIGURES_PATH + '___' + metric)

        plt.figure(figsize=(10, 5))
        plt.title(label='Minimal distances between clusters')
        for n in names:
            min_dists = stats[n]['statistics']['min_distances']
            plt.plot(min_dists, label=n)
        plt.legend()
        plt.xlabel('Iteration')
        plt.ylabel('Minimal distance')
        plt.savefig(FIGURES_PATH + '___' + 'min_dists')

        plt.figure(figsize=(5, 5))
        plt.title(label='Time of clustering')
        # Assumes 'time_of_all' is timedelta-like (the code previously read
        # `.seconds`, which drops whole days and the sub-second part);
        # total_seconds() reports the full duration.
        times = [stats[n]['statistics']['time_of_all'].total_seconds() for n in names]
        plt.grid(zorder=0)
        plt.bar(names, times)
        plt.xticks(rotation=45)
        plt.xlabel('Method')
        plt.ylabel('Time, seconds')
        # No plt.legend() here: nothing in this figure carries a label.
        plt.savefig(FIGURES_PATH + '___' + 'time_of_clustering')

        plt.figure(figsize=(10, 5))
        plt.title(label='Times of iterations')
        for n in names:
            raw_times = stats[n]['statistics']['time_of_iter']
            if len(raw_times) != 0:
                # Convert into a new list so `stats` stays untouched and the
                # cell can be re-run. Timedelta-like items become their full
                # length in microseconds (`.microseconds` alone is only the
                # sub-second component); plain numbers pass through unchanged.
                iter_times = [
                    item.total_seconds() * 1e6 if hasattr(item, 'total_seconds') else item
                    for item in raw_times
                ]
                plt.plot(iter_times, label=n)
        plt.legend()
        plt.xlabel('Iteration')
        plt.ylabel('Time, microseconds')
        plt.savefig(FIGURES_PATH + '___' + 'times')

    @staticmethod
    def print_result(stats):
        print('Optimal:')
        print(f'\tsilhouette: 1\n\tdbi: min\n\tcohen: 1\n\tjaccard: 1\n\tfm: max\n\trand: 1\n\tadjusted_rand: 1\n\twss: 0\n\tbss: max\n\tf1: 1')

        for method in list(stats.keys()):
            print(f'Method: {method}')
            for metric in ['silhouette', 'dbi', 'cohen', 'jaccard', 'fm', 'rand', 'adjusted_rand']:#, 'wss', 'bss', 'f1']:
                if metric == 'silhouette':
                    print(f'\t{metric}: {np.mean(list(stats[method][metric].values()))}')
                else:
                    print(f'\t{metric}: {stats[method][metric]}')


    def run_statistics(self,
                       path: str = 'data_processed',
                       nrows: int = 1_000_000,
                       top_lim: int = 1_000,
                       k: int = 100,
                       methods: list[str] = None,
                       field: str = 'product_id',
                       random_state=None
                       ):
        """Cluster two random halves of a dataset with each method and score the results.

        Reads ``2 * nrows`` rows from ``DATASETS_PATH + path + '.csv'``, drops the
        ``'Unnamed: 0'`` column, splits the rows 50/50, and computes the
        ``Distances.get_pp`` result for each half. Every method is then fitted
        on both halves, and internal metrics (per half) plus pair-counting
        agreement metrics (first half vs. second half) are collected.

        :param path: CSV file name without extension, relative to ``DATASETS_PATH``
        :param nrows: rows per half; twice as many are read from the file
        :param top_lim: forwarded to ``Distances.get_pp`` and ``Clustering.fit``
        :param k: forwarded to ``Clustering.fit``
        :param methods: method names to evaluate; ``None`` selects
            min_dist, max_dist, average, weighted, ward and k_means
        :param field: column name forwarded to ``Distances.get_pp``
        :param random_state: seed forwarded to ``train_test_split`` so the split
            can be reproduced; ``None`` (the default) keeps it unseeded
        :return: dict keyed by method name. Each value holds ``inter``, ``outer``,
            ``wss``, ``bss``, ``cohen``, ``dbi`` and ``silhouette`` as two-item
            lists (first half, second half); ``TP``, ``TN``, ``FP``, ``FN``,
            ``f1``, ``rand``, ``jaccard``, ``adjusted_rand`` and ``fm`` as
            single values; and ``statistics`` from ``Clustering.get_stats()``
        """

        data = pd.read_csv(DATASETS_PATH + path + '.csv', nrows=2 * nrows).drop(columns=['Unnamed: 0'])
        x_train, x_test = train_test_split(data, test_size=0.5, random_state=random_state)
        d1 = Distances(data=x_train, nrows=nrows)
        d2 = Distances(data=x_test, nrows=nrows)

        print('Collecting data for first...')
        pp1 = d1.get_pp(top_lim=top_lim, batch_size=100_000, field=field)
        print('Collecting data for second...')
        pp2 = d2.get_pp(top_lim=top_lim, batch_size=100_000, field=field)

        stats = dict()
        if methods is None:
            methods = ['min_dist', 'max_dist', 'average', 'weighted', 'ward', 'k_means']

        print(f'Starting agglomerative clustering with \n\tfile: {path},\n\tfield: {field},\n\tnrows: {nrows},\n\ttop_lim: {top_lim}')

        for m in methods:
            c = Clustering(get_dists=get_dists)
            print(f'Clustering with method {m}')
            clusters1, dists1 = c.fit(metric='euclidean', method=m, top_lim=top_lim, k=k, dists=pp1)
            clusters2, dists2 = c.fit(metric='euclidean', method=m, top_lim=top_lim, k=k, dists=pp2)

            stats[m] = dict()

            # Internal metrics, computed separately for each half.
            i, o = self.inter(clusters1, dists1), self.outer(clusters1, dists1)
            i2, o2 = self.inter(clusters2, dists2), self.outer(clusters2, dists2)

            stats[m]['inter'] = [i, i2]
            stats[m]['outer'] = [o, o2]
            stats[m]['wss'] = [np.mean(i), np.mean(i2)]
            stats[m]['bss'] = [np.mean(o), np.mean(o2)]
            stats[m]['cohen'] = [self.cohen(i, o), self.cohen(i2, o2)]
            stats[m]['dbi'] = [self.dbi(i, o), self.dbi(i2, o2)]
            stats[m]['silhouette'] = [
                np.mean(list(self.silhouette(clusters1, dists1).values())),
                np.mean(list(self.silhouette(clusters2, dists2).values())),
            ]

            # Agreement between the two clusterings, from pair counts.
            tp, tn, fp, fn = self.tp_tn_fp_fn(clusters1, clusters2)

            stats[m]['TP'], stats[m]['TN'], stats[m]['FP'], stats[m]['FN'] = tp, tn, fp, fn
            stats[m]['f1'] = self.f1(tp, tn, fp, fn)
            stats[m]['rand'] = self.rand(tp, tn, fp, fn)
            stats[m]['jaccard'] = self.jaccard(tp, tn, fp, fn)
            stats[m]['adjusted_rand'] = self.adjusted_rand(tp, tn, fp, fn)
            stats[m]['fm'] = self.fm(tp, tn, fp, fn)

            # NOTE(review): get_stats() is read after the second fit on the same
            # Clustering object; whether it also covers the first fit depends on
            # Clustering - confirm.
            stats[m]['statistics'] = c.get_stats()

        return stats

In [3]:
# Run the full comparison. `methods` is left at its default, so every built-in
# method is fitted on both halves of the data. The returned `stats` is used by
# the plotting and printing cells below.
t = Tester()
stats = t.run_statistics(nrows=1_000_000, top_lim=1000, k=200)

In [None]:
# Save the comparison charts; needs `stats` from the run_statistics cell.
t = Tester()
t.plot(stats)

Collecting data for first...
Top of dataset length: 325284
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/4 [00:00<?, ?it/s]

Collecting data for second...
Top of dataset length: 325553
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/4 [00:00<?, ?it/s]

Starting agglomerative clustering with 
	file: data_processed,
	field: product_id,
	nrows: 1000000,
	top_lim: 1000
Clustering with method min_dist
Starting counting distances between clusters...


  0%|          | 0/1075 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/875 [00:00<?, ?it/s]

In [None]:
# Text summary of the metrics; relies on `t` and `stats` from the earlier cells.
t.print_result(stats)