In [1]:
import numpy as np
import scipy.stats as sts
from UltrametricMatrix import ultramatrix
from HierarchyClusteringWithoutLogs import hierarchy
from MinMaxHierarchy import MinMaxHierarchy
from MedianHierarchy import MedianHierarchy
from Linkages import single_linked
from Linkages import complete_linked
from Linkages import group_average_linked
from Linkages import weighted_average_linked
import pandas as pd

In [2]:
def pipe(points, method, calc_metric):
    if method.__name__ == 'MinMaxHierarchy':
        logs = MinMaxHierarchy(points, metric='euclidean')
    elif method.__name__ == 'MedianHierarchy':
        logs = MedianHierarchy(points, metric='euclidean')
    else:
        logs = hierarchy(points, metric='euclidean', method=method)
    
    start_matrix = logs[1]
    finish_matrix = ultramatrix(logs[0], logs[2])

    if calc_metric == 'max_abs':
        return np.max(np.abs(start_matrix - finish_matrix))
    elif calc_metric == 'norm_sum_abs':
        n_points = len(start_matrix) 
        N_edge = n_points * (n_points - 1) / 2

        return np.sum(np.abs(finish_matrix - start_matrix)) / N_edge
    else:
        return 'give calc_metric'

In [3]:
def get_gen_sample(size):
    N = int(size / 3)

    norm1 = sts.norm(1, 0.2)
    norm2 = sts.norm(1.5, 0.1)
    norm3 = sts.norm(2, 0.2)

    x = np.append(norm1.rvs(N), np.append(norm2.rvs(N), norm3.rvs(N), axis=0), axis=0)
    y = np.append(norm1.rvs(N), np.append(norm2.rvs(N), norm3.rvs(N), axis=0), axis=0)

    points = list(zip(x, y))

    return np.array(points)

In [4]:
def times_when_method_better(results, res_column):
    res = dict()
    for col in results.columns:
        if col != res_column:
            res[col] = [results[results[col] > results[res_column]].shape[0] / results.shape[0]]

    return pd.DataFrame(res, index=[res_column])

In [5]:
MetricsFullExp = {
    'single_linked': [],
    'complete_linked': [],
    'group_average_linked': [],
    'weighted_average_linked': [],
    'min_max_linked': [],
    'median_linked': []
}

size = 150
sample_size = 5
n_iter = 10
points = get_gen_sample(size)
ssamples = []
calc_metric = 'norm_sum_abs'

for _ in range(n_iter):
    indices = np.random.choice(points.shape[0], size=sample_size, replace=False)
    sample = points[indices]
    ssamples.append(sample)

    MetricsFullExp['single_linked'].append(pipe(sample, single_linked, calc_metric))
    MetricsFullExp['complete_linked'].append(pipe(sample, complete_linked, calc_metric))
    MetricsFullExp['group_average_linked'].append(pipe(sample, group_average_linked, calc_metric))
    MetricsFullExp['weighted_average_linked'].append(pipe(sample, weighted_average_linked, calc_metric))
    MetricsFullExp['min_max_linked'].append(pipe(sample, MinMaxHierarchy, calc_metric))
    MetricsFullExp['median_linked'].append(pipe(sample, MedianHierarchy, calc_metric))

VERSION 18
Distance matrix: 0 step
          0         1         2         3         4
0  0.000000  1.593879  0.788071  0.207477  0.639608
1  1.593879  0.000000  0.811846  1.533468  0.976702
2  0.788071  0.811846  0.000000  0.759495  0.170543
3  0.207477  1.533468  0.759495  0.000000  0.641868
4  0.639608  0.976702  0.170543  0.641868  0.000000

Clusters:['0' '1' '2' '3' '4']


(2, 4)
Clusters: ['(2, 4)' '0' '1' '3']
Distance matrix: 1 step
          (2, 4)         0         1         3
(2, 4)  0.000000  0.713839  0.894274  0.700681
0       0.713839  0.000000  1.593879  0.207477
1       0.894274  1.593879  0.000000  1.533468
3       0.700681  0.207477  1.533468  0.000000



(0, 3)
Clusters: ['(0, 3)' '(2, 4)' '1']
Distance matrix: 2 step
          (0, 3)    (2, 4)         1
(0, 3)  0.000000  0.713839  1.563673
(2, 4)  0.713839  0.000000  0.894274
1       1.563673  0.894274  0.000000



((0, 3), (2, 4))
Clusters: ['((0, 3), (2, 4))' '1']
Distance matrix: 3 step
                  ((0, 3)

In [6]:
results = pd.DataFrame(MetricsFullExp)
results.head()

Unnamed: 0,single_linked,complete_linked,group_average_linked,weighted_average_linked,min_max_linked,median_linked
0,0.387824,0.356572,0.320977,0.320977,0.320977,0.320977
1,0.500619,0.510939,0.387053,0.387053,0.387053,0.387053
2,0.321525,0.497582,0.303563,0.283306,0.332905,0.282726
3,0.309869,0.385055,0.231283,0.281164,0.240378,0.230155
4,0.493411,0.387421,0.269943,0.334297,0.298924,0.265143


In [7]:
times_when_method_better(results, 'min_max_linked')
# случаи, когда максимальное значение в ультраметрической матрице МинМакс метода
# меньше, чем в остальных (1 - наилучший показатель, 0 - наихудший)

Unnamed: 0,single_linked,complete_linked,group_average_linked,weighted_average_linked,median_linked
min_max_linked,0.8,0.9,0.1,0.5,0.0


In [8]:
times_when_method_better(results, 'median_linked')
# случаи, когда максимальное значение в ультраметрической матрице Медианного метода
# меньше, чем в остальных (1 - наилучший показатель, 0 - наихудший)

Unnamed: 0,single_linked,complete_linked,group_average_linked,weighted_average_linked,min_max_linked
median_linked,0.9,1.0,0.7,0.7,0.6


In [9]:
# посмотреть почему иногда лучше МаксМин лучше Медианы и наоборот (Минимальное отклонение)
# посмотреть какие еще методы есть
# Метрика ХаусДорфа, МаксиМин (новый в статье)
# Математически и Вычислительно доказать почему МинМакс - монотонный 
# Вычислительно - доказать или опровергнуть монотонность

In [10]:
results.min_max_linked > results.median_linked

0    False
1    False
2     True
3     True
4     True
5    False
6     True
7    False
8     True
9     True
dtype: bool

In [11]:
ssamples[9]

array([[0.91364131, 1.05622085],
       [1.62710156, 1.5071381 ],
       [1.10412272, 0.9606128 ],
       [1.45140957, 1.71111666],
       [1.33897208, 1.36142123]])

In [12]:
from sklearn.metrics import pairwise_distances

pd.DataFrame(pairwise_distances(ssamples[9], metric='euclidean')) # почему медиана работает лучше

Unnamed: 0,0,1,2,3,4
0,0.0,0.844009,0.213129,0.847398,0.523501
1,0.844009,0.0,0.756437,0.269212,0.322881
2,0.213129,0.756437,0.0,0.826961,0.464545
3,0.847398,0.269212,0.826961,0.0,0.367327
4,0.523501,0.322881,0.464545,0.367327,0.0
