In [1]:
import math

FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
DICTS_PATH = 'out/dicts/'
CLUSTERS_PATH = 'out/clusters/'

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import os
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
from tqdm.notebook import tqdm
from multiprocesspandas import applyparallel
from pandarallel import pandarallel
import psutil
from sys import getsizeof
import networkx as nx
from scipy.cluster.hierarchy import linkage, fcluster


from netgraph import Graph, InteractiveGraph, EditableGraph

import pickle
import gc 


tqdm.pandas()
from helper import *

In [3]:
# with open(DATASETS_PATH + 'date_distances.pkl', 'rb') as f:
#     dists = pickle.load(f)

In [4]:
# with open(DATASETS_PATH + 'user_purchases.pkl', 'rb') as f:
#     dists = pickle.load(f)

In [5]:
# dists

In [6]:
# dict(sorted(list(dists.items()), key=(lambda x: x[1][1]), reverse=True))

In [7]:
# list(dists.items()).sort(key=(lambda x: x[1][0]))
# (product_1, product_2) - [mean data distance, count, quartile range]

In [8]:
def default(mean, count, scatter):
    return (mean + abs(scatter)) / (count ** 2)

def get_dists(dists, count_lower=10, dist_func=default):
    return dict([(i[0], dist_func(i[1][0], i[1][1], i[1][2])) 
                 for i in dists.items() 
                 if (i[1][1] >= count_lower
                     or dist_func(i[1][0], i[1][1], i[1][2]) != 0) 
                 and dist_func(i[1][0], i[1][1], i[1][2]) >= 0])
    
    

In [9]:
# dists = get_dists(dists, count_lower=30, dist_func=default)

In [10]:
# with open(CLUSTERS_PATH + 'dists.pkl', 'wb') as f:
#     pickle.dump(dists, f)

In [11]:
def k_means_clustering(dists, k, max_iterations=100):
    
    def dist_between_products(product1, product2):
        if product1 == product2:
            return 0

        if (product1, product2) in dists:
            return dists[(product1, product2)]

        if (product2, product1) in dists:
            return dists[(product2, product1)]

        return float('inf')
    
    
    def comp(product, mini):
        return dist_between_products(product, mini)
    
    
    def get_dist_between(product, cluster):
        dist = 0.0
        cnt = 0
        for c in cluster:
            if (product, c) in dists:
                dist += dists[(product, c)]
                cnt += 1
            elif (c, product) in dists:
                dist += dists[(c, product)]
                cnt += 1
        if cnt == 0:
            return float('inf')
        
        return dist / cnt
    
    def clear_clusters(clusters):
        for cluster in clusters:
        
            mini = (float('inf'), 0)
            for i, p in enumerate(cluster):
                mean_dist = get_dist_between(p, cluster)
                if mean_dist < mini[0]:
                    mini = (mean_dist, p)

            cluster = sorted(cluster, key=(lambda x: comp(x, mini[1])), reverse=False)
            cluster = [p for p in cluster if dist_between_products(p, mini[1]) < float('inf')]
        
        clusters = [i for i in clusters if len(i) > 0]
        return clusters
        
        
    products = np.unique(np.concatenate(list(dists.keys())))
    
    clusters = np.random.choice(products, k, replace=False)
    clusters = [[c] for c in clusters]
    products = products[~np.isin(products, clusters)]
    
    
    mi_break = False
    ri_break = False
    
    print('Starting products splitting to clusters...')
    for p in tqdm(products):
        p_dist = [get_dist_between(p, c) for c in clusters]
        pos = np.argmin(p_dist)
        clusters[pos].append(p)
        products = products[products != p]
            
    
    
    print('Starting operating over clusters...')
    for _ in range(max_iterations):
        
        clusters_prev = clusters
        
        for c in tqdm(clusters):
            for p in c:
                p_dist = [get_dist_between(p, c_other) for c_other in clusters]
                pos = np.argmin(p_dist)
                c.remove(p)
                clusters[pos].append(p)
                
                
        if clusters_prev == clusters:
            print('Clusters stabilizied!')
            ri_break = True
            break
            
    if not ri_break:
        print('Stopped for maximum of iterations: {}'.format(max_iterations))
    
    clusters = clear_clusters(clusters)

    
    return clusters

In [12]:
# clusters = k_means_clustering(dists, k=100, max_iterations=10)

In [13]:
# with open(CLUSTERS_PATH + 'k_means.pkl','wb') as f:
#      pickle.dump(clusters, f)

In [14]:
# clusters_ward = ward_clustering(dists, k=10)

In [15]:
# with open(CLUSTERS_PATH + 'ward.pkl','wb') as f:
#      pickle.dump(clusters_ward, f)

In [23]:
class Metric:
    def __init__(self, method='euclidean', max=100):
        self.method = method
        self.max = max

    def run(self, cluster1, cluster2, dists):
        self.cluster1 = cluster1
        self.cluster2 = cluster2
        self.dists = dists

        if self.method == 'euclidean':
            return self.euclidean()
        if self.method == 'min_dist':
            return self.min_dist()
        if self.method == 'max_dist':
            return self.max_dist()
        if self.method == 'average':
            return self.average()
        if self.method == 'ward':
            return self.ward()

    def _get(self, i, j):
        if i == j:
            return 0.0
        if (i, j) in self.dists:
            return self.dists[(i, j)]
        if (j, i) in self.dists:
            return self.dists[(j, i)]
        return self.max


    def euclidean(self):
        n1, n2 = len(self.cluster1), len(self.cluster2)
        s = 0.0
        for i in self.cluster1:
            for j in self.cluster2:
                s += self._get(i, j) ** 2
        return np.sqrt(s)


    def min_dist(self):
        s, mini = 0.0, self.max + 1
        for i in self.cluster1:
            for j in self.cluster2:
                s = self._get(i, j)

                if s < mini:
                    mini = s
        return mini


    def max_dist(self):
        s, maxi = 0.0, -1.0
        for i in self.cluster1:
            for j in self.cluster2:
                s = self._get(i, j)

                if s > maxi:
                    maxi = s
        return maxi


    def average(self):
        n1, n2 = len(self.cluster1), len(self.cluster2)
        s = 0.0
        for i in self.cluster1:
            for j in self.cluster2:
                s += self._get(i, j)
        return s / (n1 * n2)


    def ward(self):
        n1, n2 = len(self.cluster1), len(self.cluster2)
        s_u, s_1, s_2 = 0.0, 0.0, 0.0
        for i in self.cluster1:
            for j in self.cluster2:
                s_u += self._get(i, j) ** 2

        for i in range(n1):
            for j in range(i + 1, n1):
                s_1 += self._get(self.cluster1[i], self.cluster1[j])

        for i in range(n2):
            for j in range(i + 1, n2):
                s_2 += self._get(self.cluster2[i], self.cluster2[j])
        return (s_u - s_1 - s_2) / (n1 + n2)

In [36]:
class Clustering:
    def __init__(self, get_dists=get_dists):
        self.get_dists = get_dists
        self.statistics = {
            'min_distances': [],
            'time_of_iter': [],
            'time_of_all': 0.0,
            'count_of_iters': 0.0,
            }

    def get_stats(self):
        self.statistics['time_of_iter'] = np.array(self.statistics['time_of_iter']).mean()
        for k in self.statistics.keys():
            print(f"{k} --- {self.statistics[k]}")
        return self.statistics

    @staticmethod
    def _merge_clusters(cluster1, cluster2):
        merged_cluster = cluster1 + cluster2
        return merged_cluster

    def run(self, dists, k):
        start0 = datetime.now()

        elements = np.unique(list(dists.keys())[:100_000])
        # elements = list(set(list(np.concatenate(dists.keys())[:10_000])))
        clusters = [[i] for i in elements]
        iters = len(elements) - k

        clusters_dists = np.full((len(elements), len(elements)), -1.0)

        print('Starting counting distances between clusters...')
        for i in tqdm(range(len(clusters))):
            for j in range(i + 1, len(clusters)):
                distance = self.metric.run(clusters[i], clusters[j], dists)
                clusters_dists[i][j] = distance
                clusters_dists[j][i] = distance

        print('Starting collapsing closest clusters...')
        for _ in tqdm(range(iters)):
            start = datetime.now()

            a = np.argmin(clusters_dists)
            i, j = a // clusters_dists.shape[1], a % clusters_dists.shape[1]

            min_distance = clusters_dists[i, j]
            merged_cluster = self._merge_clusters(clusters[i], clusters[j])
            del clusters[j]
            del clusters[i]
            clusters_dists[i, :] = np.inf
            clusters_dists[j, :] = np.inf
            clusters_dists[:, i] = np.inf
            clusters_dists[:, j] = np.inf

            clusters.append(merged_cluster)

            j = len(clusters) - 1
            for i in range(len(clusters)):
                distance = self.metric.run(clusters[i], clusters[j], dists)
                clusters_dists[i][j], clusters_dists[j][i] = distance, distance

            self.statistics['min_distances'].append(min_distance)
            self.statistics['time_of_iter'].append(datetime.now() - start)
        self.statistics['count_of_iters'] = iters
        self.statistics['time_of_all'] = datetime.now() - start0

        return clusters

    def run_k_means(self, dists, k, max_iter=10_000):
        # Можно наканпливать minimal_dist, как внутрикластерное расстояние (в агломеративных тоже)
        # Можно сохранять среднее расстояние между кластерами и внутри кластеров, чтобы показывать на графике

        start0 = datetime.now()
        elements = np.unique(sorted(list(dists.keys()))[:25_000])

        clusters = np.random.choice(elements, k, replace=False)
        elements = elements[~np.isin(elements, clusters)]
        clusters = [[c] for c in clusters]


        print('Starting elements splitting by clusters...')
        for e in tqdm(elements):
            minimal_dist = self.metric.max * 5.0
            cluster_index = 0
            for i, c in enumerate(clusters):
                dist = self.metric.run([e], c, dists)
                if dist < minimal_dist:
                    minimal_dist = dist
                    cluster_index = i

            clusters[cluster_index].append(e)


        print('Starting operating over clusters...')
        for _ in tqdm(range(max_iter)):
            self.statistics['count_of_iters'] += 1
            start = datetime.now()
            prev_clusters = clusters.copy()
            for c1 in clusters:
                for pos_el, el in enumerate(c1):
                    minimal_dist = self.metric.max * 5.0
                    cluster_index = 0
                    for i, c in enumerate(clusters):
                        dist = self.metric.run([el], c, dists)
                        if dist < minimal_dist:
                            minimal_dist = dist
                            cluster_index = i

                    self.statistics['min_distances'].append(minimal_dist)

                    del c1[pos_el]
                    clusters[cluster_index].append(el)

            if prev_clusters == clusters:
                print('Clusters stabilizied!')
                self.statistics['time_of_all'] = datetime.now() - start0
                return clusters

            self.statistics['time_of_iter'].append(datetime.now() - start)

        print('Stopped for maximum iterations: {}'.format(max_iter))
        self.statistics['time_of_all'] = datetime.now() - start0
        return clusters


    def fit(self, metric, type='aglomerative', dists_path='date_distances', k=10, max_iter=10_000):
        with open(DATASETS_PATH + dists_path + '.pkl', 'rb') as f:
            self.dists = pickle.load(f)

        print('clustering...')

        dists = self.get_dists(self.dists)

        self.metric = metric

        if type == 'aglomerative':
            return self.run(dists, k)
        else:
            return self.run_k_means(dists, k, max_iter)

In [37]:
c = Clustering()
# clusters_euc = c.fit(metric=Metric('max_dist'), type='k_means', k=10_000, max_iter=10_000)
#
# with open(CLUSTERS_PATH + 'k_means_max_dist.pkl','wb') as f:
#      pickle.dump(clusters_euc, f)

In [None]:
clusters_euc = c.fit(metric=Metric('max_dist'), type='aglomerative', k=10_000, max_iter=10_000)


with open(CLUSTERS_PATH + 'ward_max_dist.pkl','wb') as f:
     pickle.dump(clusters_euc, f)


clustering...
Starting counting distances between clusters...


  0%|          | 0/12201 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/2201 [00:00<?, ?it/s]

In [35]:
c.get_stats()

min_distances --- []
time_of_iter --- nan
time_of_all --- 0:00:05.075247
count_of_iters --- -7711


  self.statistics['time_of_iter'] = np.array(self.statistics['time_of_iter']).mean()
  ret = ret.dtype.type(ret / rcount)


{'min_distances': [],
 'time_of_iter': nan,
 'time_of_all': datetime.timedelta(seconds=5, microseconds=75247),
 'count_of_iters': -7711}

In [None]:
# with open(CLUSTERS_PATH + 'k_means_euclidean.pkl','wb') as f:
#      pickle.dump(clusters_euc, f)