In [152]:
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
DICTS_PATH = 'out/dicts/'
CLUSTERS_PATH = 'out/clusters/'

In [65]:
import pandas as pd
from datetime import datetime, timedelta
import os
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
from tqdm.notebook import tqdm
from multiprocesspandas import applyparallel
from pandarallel import pandarallel
import psutil
from sys import getsizeof
import networkx as nx
from scipy.cluster.hierarchy import linkage, fcluster


from netgraph import Graph, InteractiveGraph, EditableGraph

import pickle
import gc 


tqdm.pandas()
from helper import *

In [160]:
with open(DATASETS_PATH + 'date_distances.pkl', 'rb') as f:
    dists = pickle.load(f)

In [161]:
dists
# (product_1, product_2) - [mean data distance, count, quartile range]

{(10366, 10363): [0.0, 1, 0.0],
 (10366, 10364): [0.0, 1, 0.0],
 (10366, 10365): [0.0, 1, 0.0],
 (10366, 1231): [0.0, 1, 0.0],
 (10366, 148): [0.0, 1, 0.0],
 (10366, 10367): [0.0, 1, 0.0],
 (10366, 10368): [0.0, 1, 0.0],
 (10366, 4367): [0.0, 1, 0.0],
 (10366, 5578): [0.0, 1, 0.0],
 (10366, 10369): [0.0, 1, 0.0],
 (10366, 164): [0.0, 1, 0.0],
 (148, 10363): [0.0, 1, 0.0],
 (148, 10364): [0.0, 1, 0.0],
 (148, 10365): [0.0, 1, 0.0],
 (148, 1231): [0.0, 1, 0.0],
 (148, 10366): [0.0, 1, 0.0],
 (148, 10367): [0.0, 1, 0.0],
 (148, 10368): [0.0, 1, 0.0],
 (148, 4367): [0.0, 1, 0.0],
 (148, 5578): [0.0, 3, 0.0],
 (148, 10369): [0.0, 1, 0.0],
 (10367, 10363): [0.0, 1, 0.0],
 (10367, 10364): [0.0, 1, 0.0],
 (10367, 10365): [0.0, 1, 0.0],
 (10367, 1231): [0.0, 1, 0.0],
 (10367, 10366): [0.0, 1, 0.0],
 (10367, 148): [0.0, 1, 0.0],
 (10367, 10368): [0.0, 1, 0.0],
 (10367, 4367): [0.0, 1, 0.0],
 (10367, 5578): [0.0, 1, 0.0],
 (10367, 10369): [0.0, 1, 0.0],
 (10367, 164): [0.0, 4, 0.0],
 (10368, 1036

In [162]:
def default(mean, count, scatter):
    return (mean + abs(scatter)) / (count ** 2)

def get_dists(dists, count_lower=10, dist_func=default):
    return dict([(i[0], dist_func(i[1][0], i[1][1], i[1][2])) 
                 for i in dists.items() 
                 if (i[1][1] >= count_lower 
                     or dist_func(i[1][0], i[1][1], i[1][2]) != 0) 
                 and dist_func(i[1][0], i[1][1], i[1][2]) >= 0])
    
    

In [163]:
dists = get_dists(dists, count_lower=30, dist_func=default)

In [164]:
with open(CLUSTERS_PATH + 'dists.pkl', 'wb') as f:
    pickle.dump(dists, f)

In [165]:
def k_means_clustering(dists, k, max_iterations=100):
    
    def dist_between_products(product1, product2):
        if product1 == product2:
            return 0

        if (product1, product2) in dists:
            return dists[(product1, product2)]

        if (product2, product1) in dists:
            return dists[(product2, product1)]

        return float('inf')
    
    
    def comp(product, mini):
        return dist_between_products(product, mini)
    
    
    def get_dist_between(product, cluster):
        dist = 0.0
        cnt = 0
        for c in cluster:
            if (product, c) in dists:
                dist += dists[(product, c)]
                cnt += 1
            elif (c, product) in dists:
                dist += dists[(c, product)]
                cnt += 1
        if cnt == 0:
            return float('inf')
        
        return dist / cnt
    
    def clear_clusters(clusters):
        for cluster in clusters:
        
            mini = (float('inf'), 0)
            for i, p in enumerate(cluster):
                mean_dist = get_dist_between(p, cluster)
                if mean_dist < mini[0]:
                    mini = (mean_dist, p)

            cluster = sorted(cluster, key=(lambda x: comp(x, mini[1])), reverse=False)
            cluster = [p for p in cluster if dist_between_products(p, mini[1]) < float('inf')]
        
        clusters = [i for i in clusters if len(i) > 0]
        return clusters
        
        
    products = np.unique(np.concatenate(list(dists.keys())))
    
    clusters = np.random.choice(products, k, replace=False)
    clusters = [[c] for c in clusters]
    products = products[~np.isin(products, clusters)]
    
    
    mi_break = False
    ri_break = False
    
    print('Starting products splitting to clusters...')
    for p in tqdm(products):
        p_dist = [get_dist_between(p, c) for c in clusters]
        pos = np.argmin(p_dist)
        clusters[pos].append(p)
        products = products[products != p]
            
    
    
    print('Starting operating over clusters...')
    for _ in range(max_iterations):
        
        clusters_prev = clusters
        
        for c in tqdm(clusters):
            for p in c:
                p_dist = [get_dist_between(p, c_other) for c_other in clusters]
                pos = np.argmin(p_dist)
                c.remove(p)
                clusters[pos].append(p)
                
                
        if clusters_prev == clusters:
            print('Clusters stabilizied!')
            ri_break = True
            break
            
    if not ri_break:
        print('Stopped for maximum of iterations: {}'.format(max_iterations))
    
    clusters = clear_clusters(clusters)

    
    return clusters

In [166]:
clusters = k_means_clustering(dists, k=100, max_iterations=10)

Starting products splitting to clusters...


  0%|          | 0/26309 [00:00<?, ?it/s]

Starting operating over clusters...


  0%|          | 0/100 [00:00<?, ?it/s]

Clusters stabilizied!


In [167]:
with open(CLUSTERS_PATH + 'k_means.pkl','wb') as f:
     pickle.dump(clusters, f)

In [168]:
def ward_clustering(distances, k):
    # Извлечение уникальных элементов из словаря расстояний
    elements = np.unique(list(distances.keys())[:10_000])
    
    # Создание матрицы расстояний из словаря
    num_elements = len(elements)
    
    maxi = max(list(distances.values()))
    distance_matrix = np.full((num_elements, num_elements), maxi * 100)
    
    for i in range(num_elements):
        for j in range(i + 1, num_elements):
            element1 = elements[i]
            element2 = elements[j]
            
            if (element1, element2) in distances:
                distance = distances[(element1, element2)]
                
                distance_matrix[i, j] = distance
                distance_matrix[j, i] = distance
                
            if (element2, element1) in distances:
                distance = distances[(element2, element1)]
            
                distance_matrix[i, j] = distance
                distance_matrix[j, i] = distance
    
    # Выполнение иерархической кластеризации методом Варда
    linkage_matrix = linkage(distance_matrix, method='ward')
    
    # Назначение точек кластерам
    labels = fcluster(linkage_matrix, k, criterion='maxclust')
    
    # Создание словаря с метками кластеров
    clustered_data = {element: labels[i] for i, element in enumerate(elements)}
    ans = [[] for i in range(k)]
    for i, l in enumerate(labels):
        ans[l - 1].append(elements[i])
    
    return ans

In [169]:
clusters_ward = ward_clustering(dists, k=10)

In [170]:
with open(CLUSTERS_PATH + 'ward.pkl','wb') as f:
     pickle.dump(clusters_ward, f)