In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

In [6]:
# Clusterer
class KMeansMethod:
    """Minimal k-means clusterer over the rows of a pandas DataFrame.

    Prototypes (centroids) are seeded with the first ``cluster_count`` rows of
    ``X_set`` and then refined by ``start()`` until no prototype moves by
    ``delta_eps`` or more.

    Attributes:
        X_set: DataFrame whose rows are the objects to cluster.
        cluster_count: Number of clusters to build.
        clusters: Current prototypes, one pandas Series per cluster; empty
            until ``start()`` has been called.
        metric_func: Callable ``(x, y) -> distance``. The default is the
            *squared* Euclidean distance (no square root is taken).
        delta_eps: Convergence threshold in ``metric_func`` units. With the
            default squared metric, ``0.01 * 0.01`` corresponds to a prototype
            shift of 0.01 in plain Euclidean distance.
    """

    def __init__(self, X_set, cluster_count):
        """Store the data set and the requested number of clusters.

        Args:
            X_set: DataFrame of objects (one row per object).
            cluster_count: Number of clusters; validated in ``start()``.
        """
        self.X_set = X_set
        self.cluster_count = cluster_count
        self.clusters = []
        # Squared Euclidean distance between two feature vectors.
        self.metric_func = lambda x, y: ((x - y) ** 2).sum()
        self.delta_eps = 0.01 * 0.01

    def get_clusters(self, indexes=True):
        """Assign every object of ``X_set`` to its nearest prototype.

        Args:
            indexes: If True (default), cluster members are reported by their
                row index labels; otherwise the row Series themselves are
                returned.

        Returns:
            dict mapping cluster index -> list of members. Keys appear in the
            order in which clusters are first encountered while scanning the
            rows. Ties are resolved in favour of the lowest cluster index.
            If there are no prototypes yet (``start()`` has not been run),
            every object is grouped under the key ``-1``.
        """
        cluster_objs = {}
        for obj_idx, obj in self.X_set.iterrows():
            min_dist = None
            cluster_idx = -1
            for i, prototype_obj in enumerate(self.clusters):
                dist = self.metric_func(obj, prototype_obj)
                # The first prototype always seeds the minimum, so an object
                # can never stay unassigned (-1) while prototypes exist.
                if cluster_idx < 0 or dist < min_dist:
                    min_dist = dist
                    cluster_idx = i
            member = obj_idx if indexes else obj
            cluster_objs.setdefault(cluster_idx, []).append(member)
        return cluster_objs

    def start(self, max_iter=None):
        """Run the k-means iterations and store the result in ``clusters``.

        Args:
            max_iter: Optional upper bound on the number of refinement passes.
                ``None`` (default) iterates until convergence.

        Raises:
            ValueError: If ``cluster_count`` is smaller than 1 or larger than
                the number of rows in ``X_set``.
        """
        n_objects = len(self.X_set)
        if not 1 <= self.cluster_count <= n_objects:
            raise ValueError(
                "cluster_count must be between 1 and the number of objects "
                f"({n_objects}), got {self.cluster_count}"
            )

        self.clusters.clear()
        for i in range(self.cluster_count):
            # Positional lookup: works for any index, not only 0..n-1 labels.
            self.clusters.append(self.X_set.iloc[i])

        iteration = 0
        while max_iter is None or iteration < max_iter:
            iteration += 1
            cluster_objs = self.get_clusters(False)
            converged = True
            for cluster_idx, objs in cluster_objs.items():
                # New prototype = feature-wise mean of the cluster members.
                mean_val = pd.DataFrame(objs).mean()
                shift = self.metric_func(self.clusters[cluster_idx], mean_val)
                if shift >= self.delta_eps:
                    self.clusters[cluster_idx] = mean_val
                    converged = False
            if converged:
                break

In [7]:
# Dataset: Iris feature matrix as a DataFrame, target labels as a Series
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target)

# Clustering into 3 groups
k_means_method = KMeansMethod(X_set=X, cluster_count=3)
k_means_method.start()
# dict: cluster index -> [row labels of the objects in that cluster]
clusters = k_means_method.get_clusters(indexes=True)

In [8]:
# Display the mapping: cluster index -> row labels of the objects in that cluster
clusters

{2: [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49],
 0: [50,
  52,
  77,
  100,
  102,
  103,
  104,
  105,
  107,
  108,
  109,
  110,
  111,
  112,
  115,
  116,
  117,
  118,
  120,
  122,
  124,
  125,
  128,
  129,
  130,
  131,
  132,
  134,
  135,
  136,
  137,
  139,
  140,
  141,
  143,
  144,
  145,
  147,
  148],
 1: [51,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  101,
  106,
  113,
  114,
  119,
  121,
  123,
  126,
  127,
  133,
  138,
  142,
  146,
  149]}

In [9]:
# Target labels (from iris.target) of the objects assigned to cluster 0
y[clusters[0]]

50     1
52     1
77     1
100    2
102    2
103    2
104    2
105    2
107    2
108    2
109    2
110    2
111    2
112    2
115    2
116    2
117    2
118    2
120    2
122    2
124    2
125    2
128    2
129    2
130    2
131    2
132    2
134    2
135    2
136    2
137    2
139    2
140    2
141    2
143    2
144    2
145    2
147    2
148    2
dtype: int32

In [10]:
# Target labels (from iris.target) of the objects assigned to cluster 1
y[clusters[1]]

51     1
53     1
54     1
55     1
56     1
      ..
133    2
138    2
142    2
146    2
149    2
Length: 61, dtype: int32

In [11]:
# Target labels (from iris.target) of the objects assigned to cluster 2
y[clusters[2]]

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
dtype: int32