In [2]:
import pandas as pd
import numpy as np

In [255]:
class MyAgglomerative:
    """Bottom-up (agglomerative) clustering with centroid linkage.

    Every sample starts as its own cluster. The two clusters whose centroids
    are closest under ``metric`` are merged repeatedly until ``n_clusters``
    clusters remain.

    Parameters
    ----------
    n_clusters : int, default 3
        Number of clusters to stop at.
    metric : str, default "euclidean"
        One of "euclidean", "manhattan", "chebyshev" or "cosine". The name is
        checked lazily, the first time a distance is computed.
    """

    def __init__(self, n_clusters  = 3, metric = "euclidean"):
        self.n_clusters = n_clusters
        self.metric = metric

    def __str__(self):
        return f"MyAgglomerative class: n_clusters={self.n_clusters}"

    def _euclidean_distance(self, arr1, arr2):
        """Return the Euclidean distance between ``arr1`` and ``arr2``.

        The difference is reduced over its last axis, so two 1-D vectors give
        a scalar and broadcastable 2-D inputs give one distance per row.
        """
        diff = np.atleast_1d(
            np.asarray(arr1, dtype=float) - np.asarray(arr2, dtype=float)
        )
        return np.linalg.norm(diff, axis=-1)

    def _calculate_distance(self, arr1, arr2):
        """Row-wise distance between ``arr1`` and ``arr2`` under ``self.metric``.

        Both inputs are promoted to 2-D and broadcast against each other, so a
        single vector can be compared with every row of a matrix.

        Returns
        -------
        numpy.ndarray
            1-D array with one distance per (broadcast) row.

        Raises
        ------
        ValueError
            If ``self.metric`` is not a supported metric name.
        """
        arr1 = np.atleast_2d(arr1)
        arr2 = np.atleast_2d(arr2)
        if self.metric == "euclidean":
            return np.linalg.norm(arr1 - arr2, axis = 1)
        elif self.metric == "manhattan":
            return np.sum(np.abs(arr1 - arr2), axis = 1)
        elif self.metric == "chebyshev":
            return np.max(np.abs(arr1 - arr2), axis = 1)
        elif self.metric == "cosine":
            numerator = np.sum(arr1 * arr2, axis = 1)
            denominator = np.linalg.norm(arr1, axis = 1) * np.linalg.norm(arr2, axis = 1)
            # A zero-norm vector has no direction: treat its similarity as 0
            # (distance 1) instead of dividing by zero and producing NaN, which
            # would later be picked up by argmin as the "closest" pair.
            # `!= 0` (rather than `> 0`) lets NaN inputs keep propagating.
            similarity = np.divide(
                numerator,
                denominator,
                out = np.zeros_like(denominator, dtype = float),
                where = denominator != 0,
            )
            return 1 - similarity
        else:
            raise ValueError("Неправильное название метрики!")

    def fit_predict(self, X_df):
        """Cluster the rows of ``X_df`` and return one label per row.

        Parameters
        ----------
        X_df : pandas.DataFrame or 2-D array-like
            Numeric data, one sample per row.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(n_samples,)`` holding cluster indices.
            If there are fewer samples than ``n_clusters`` no merge happens
            and every sample keeps its own label.

        Raises
        ------
        ValueError
            If ``n_clusters`` is smaller than 1, if the input is not 2-D, or
            if the metric name is unknown.
        """
        if self.n_clusters < 1:
            raise ValueError("n_clusters must be at least 1")

        raw = X_df.to_numpy() if hasattr(X_df, "to_numpy") else X_df
        X = np.asarray(raw, dtype = float)
        if X.ndim != 2:
            raise ValueError("X_df must be 2-dimensional (n_samples, n_features)")

        n_samples = X.shape[0]
        clusters = [[i] for i in range(n_samples)]
        centroids = X.copy()

        # Pairwise distances between the initial singleton clusters; one
        # vectorised call per row fills the upper triangle and its mirror.
        dist_matrix = np.zeros((n_samples, n_samples))
        for i in range(n_samples - 1):
            row = self._calculate_distance(X[i], X[i + 1:])
            dist_matrix[i, i + 1:] = row
            dist_matrix[i + 1:, i] = row

        while len(clusters) > self.n_clusters:
            # Mask self-distances so argmin never selects a cluster with itself.
            np.fill_diagonal(dist_matrix, np.inf)
            i, j = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape)
            new_cluster = clusters[i] + clusters[j]
            new_centroid = np.mean(X[new_cluster], axis = 0)

            # Drop the larger index first so the smaller one stays valid.
            for idx in sorted([i, j], reverse = True):
                dist_matrix = np.delete(dist_matrix, idx, axis = 0)
                dist_matrix = np.delete(dist_matrix, idx, axis = 1)
                clusters.pop(idx)
                centroids = np.delete(centroids, idx, axis = 0)

            # The merged cluster is appended as the last row/column.
            clusters.append(new_cluster)
            centroids = np.vstack([centroids, new_centroid])
            new_distances = self._calculate_distance(centroids[:-1], new_centroid)
            dist_matrix = np.vstack([dist_matrix, new_distances])
            dist_matrix = np.hstack([dist_matrix, np.append(new_distances, 0).reshape(-1, 1)])

        # Label by the clusters that actually exist; indexing up to n_clusters
        # would fail when there are fewer samples than requested clusters.
        points = np.zeros(n_samples)
        for label, members in enumerate(clusters):
            points[members] = label
        return points
        

In [258]:
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset as pandas objects.
data = load_diabetes(as_frame=True)
# NOTE(review): `data.frame` appears to hold the feature columns *and* the
# `target` column, so the target takes part in the distance computation.
# If only the features should be clustered, `data.data` is probably what is
# wanted here - confirm intent.
X_df = data.frame
# Stop merging once 20 clusters remain, comparing centroids by cosine distance.
model = MyAgglomerative(n_clusters = 20, metric = "cosine")
# One cluster label per row of X_df.
clusters = model.fit_predict(X_df)