In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

In [2]:

# Clusterer
class MaxminMethod:
    """Maximin (max-min distance) clustering of the rows of a DataFrame.

    The first object of the data set is taken as the first prototype. After
    that, the object that is farthest from its nearest prototype is promoted
    to a new prototype for as long as that distance exceeds half of the mean
    pairwise distance between the prototypes found so far. Finally every
    object is assigned to its nearest prototype.

    Attributes
    ----------
    X_set : pandas.DataFrame
        The data set, one object per row.
    clusters : list of dict
        One entry per prototype, in discovery order, with the keys
        ``'obj_prototype_idx'`` (index label of the prototype object) and
        ``'dist'`` (Series of distances from every object to the prototype).
    metric_func : callable
        ``metric_func(row, prototype_row) -> float``; Euclidean by default.
    """

    def __init__(self, X_set):
        """Store the data set; clustering only happens in ``start()``.

        Parameters
        ----------
        X_set : pandas.DataFrame
            One object per row. Index labels identify the objects and are
            assumed to be unique.
        """
        self.X_set = X_set
        self.clusters = []
        # Metric (Euclidean).
        self.metric_func = lambda x, y: np.sqrt(((x - y) ** 2).sum())

    def get_clusters(self):
        """Assign every object to its nearest prototype.

        Returns
        -------
        dict
            ``{prototype label: [labels of the objects in that cluster]}``.
            Keys follow prototype discovery order, members follow the order
            of ``X_set.index``. Ties go to the prototype found first. The
            dict is empty when no prototypes exist (``start()`` has not been
            run, or the data set is empty).
        """
        prototype_labels = [c['obj_prototype_idx'] for c in self.clusters]
        result = {label: [] for label in prototype_labels}
        if not prototype_labels:
            return result
        # argmin returns the first minimum, matching a strict "<" scan.
        nearest_pos = self._prototype_dist_matrix().argmin(axis=1)
        # tolist() yields native Python labels, like iterating the index.
        for obj_idx, pos in zip(self.X_set.index.tolist(), nearest_pos):
            result[prototype_labels[pos]].append(obj_idx)
        return result

    def start(self):
        """Run the algorithm and (re)build ``self.clusters``.

        Any previous result is discarded. An empty data set yields no
        prototypes.
        """
        self.clusters.clear()
        if len(self.X_set.index) == 0:
            return
        # Seed with the first object, whatever its index label is; a
        # hard-coded label 0 fails on any index that does not contain 0.
        self._add_prototype(next(iter(self.X_set.index)))
        while True:
            prototype_obj_idx, max_dist = self.find_next_possible_prototype()
            # Stop once the best candidate is no farther from its nearest
            # prototype than the threshold.
            if max_dist <= self.get_average_dist_of_clusters():
                break
            self._add_prototype(prototype_obj_idx)

    def _add_prototype(self, obj_idx):
        """Register the object labelled ``obj_idx`` as a new prototype."""
        self.clusters.append({
            'obj_prototype_idx': obj_idx,
            'dist': self.get_series_of_dist_to_obj(self.X_set.loc[obj_idx]),
        })

    def _prototype_dist_matrix(self):
        """Return an (n_objects, n_prototypes) array of stored distances.

        Rows follow the order of the stored distance Series, columns follow
        the order of ``self.clusters``.
        """
        return np.column_stack(
            [np.asarray(c['dist'], dtype=float) for c in self.clusters]
        )

    def get_average_dist_of_clusters(self):
        """Return half of the mean pairwise distance between prototypes.

        Returns 0.0 when there are fewer than two prototypes.
        """
        total = 0.0
        count = 0
        for i, cluster in enumerate(self.clusters):
            for other in self.clusters[i + 1:]:
                # Explicit label lookup: distance from prototype i to the
                # other prototype.
                total += cluster['dist'].loc[other['obj_prototype_idx']]
                count += 1
        if count == 0:
            return 0.0
        return total / (2 * count)

    def find_next_possible_prototype(self):
        """Find the object that is farthest from its nearest prototype.

        Returns
        -------
        tuple
            ``(index label, distance)`` of the candidate. Ties go to the
            object that appears first. Returns ``(-1, 0.0)`` when there is
            no candidate with a positive distance (no prototypes yet, no
            objects, or every object coincides with a prototype).
        """
        if not self.clusters:
            return -1, 0.0
        # Distance from each object to its nearest prototype.
        nearest = self._prototype_dist_matrix().min(axis=1)
        if nearest.size == 0:
            return -1, 0.0
        # argmax returns the first maximum, matching a strict ">" scan.
        pos = int(nearest.argmax())
        max_dist = float(nearest[pos])
        if not max_dist > 0.0:
            return -1, 0.0
        return self.X_set.index.tolist()[pos], max_dist

    def get_series_of_dist_to_obj(self, obj):
        """Return a Series of distances from every object to ``obj``.

        Parameters
        ----------
        obj : pandas.Series
            One row of ``X_set`` (or a Series with the same labels).

        Returns
        -------
        pandas.Series
            Indexed like ``X_set``; each value is
            ``metric_func(row, obj)`` for the corresponding row.
        """
        return self.X_set.apply(lambda row: self.metric_func(row, obj), axis=1)

In [3]:
# Dataset: Iris feature table plus the target labels.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target)

# Clustering
maxmin_method = MaxminMethod(X)
maxmin_method.start()
# Dict: cluster prototype object -> [objects of the cluster]
clusters = maxmin_method.get_clusters()

In [4]:
# Display the mapping: prototype label -> labels of the objects in its cluster.
clusters

{0: [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49],
 118: [100,
  102,
  104,
  105,
  107,
  108,
  109,
  112,
  116,
  117,
  118,
  120,
  122,
  124,
  125,
  128,
  129,
  130,
  131,
  132,
  135,
  136,
  139,
  140,
  141,
  143,
  144,
  145],
 106: [50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  101,
  103,
  106,
  110,
  111,
  113,
  114,
  115,
  119,
  121,
  123,
  126,
  127,
  133,
  134,
  137,
  138,
  142,
  146,
  147,
  148,
  149]}

In [11]:
# Target labels of the objects in the cluster whose prototype is object 0.
y[clusters[0]]

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
dtype: int32

In [12]:
# Target labels of the objects in the cluster whose prototype is object 118
# (the key comes from the `clusters` output shown earlier).
y[clusters[118]]

100    2
102    2
104    2
105    2
107    2
108    2
109    2
112    2
116    2
117    2
118    2
120    2
122    2
124    2
125    2
128    2
129    2
130    2
131    2
132    2
135    2
136    2
139    2
140    2
141    2
143    2
144    2
145    2
dtype: int32

In [13]:
# Target labels of the objects in the cluster whose prototype is object 106
# (the key comes from the `clusters` output shown earlier).
y[clusters[106]]

50     1
51     1
52     1
53     1
54     1
      ..
142    2
146    2
147    2
148    2
149    2
Length: 72, dtype: int32