In [8]:
import pandas as pd
import numpy as np
import random

In [227]:
class MyKMeans:
    """K-means clustering with uniformly random starting centroids and restarts.

    Every one of the ``n_init`` runs draws its starting centroids uniformly
    inside the per-feature [min, max] range of the training data, refines them
    with at most ``max_iter`` assignment/update passes, and the run with the
    lowest within-cluster sum of squares (WCSS) is kept.

    Parameters
    ----------
    n_clusters : int, default 3
        Number of centroids to fit.
    max_iter : int, default 10
        Maximum number of assignment/update passes per run.
    n_init : int, default 3
        Number of independent runs with fresh random centroids.
    random_state : int or None, default 42
        Seed for the private random generator used to draw starting centroids.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features) or None
        Centroids of the best run; ``None`` until ``fit`` has been called.
    inertia_ : float or None
        WCSS of the best run; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_clusters=3, max_iter=10, n_init=3, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.n_init = n_init
        self.random_state = random_state
        self.inertia_ = None
        self.cluster_centers_ = None

    def __str__(self):
        return (
            f"MyKMeans class: n_clusters={self.n_clusters}, max_iter={self.max_iter}, "
            f"n_init={self.n_init}, random_state={self.random_state}"
        )

    @staticmethod
    def _as_2d_array(data):
        """Convert a DataFrame or array-like into a 2-D float ndarray.

        Raises
        ------
        ValueError
            If the converted data is not two-dimensional.
        """
        if isinstance(data, pd.DataFrame):
            arr = data.to_numpy(dtype=float)
        else:
            arr = np.asarray(data, dtype=float)
        if arr.ndim != 2:
            raise ValueError(
                f"Expected 2-D data of shape (n_samples, n_features), got {arr.ndim}-D input"
            )
        return arr

    def _euclidean_distance(self, centroids, points):
        """Return the Euclidean distance from every row of ``points`` to one centroid."""
        diff = points - centroids
        return np.sqrt(np.sum(diff**2, axis=1))

    def _calculate_wcss(self, centroid, points):
        """Return the sum of squared deviations of ``points`` from ``centroid``."""
        return np.sum((points - centroid) ** 2)

    def _distance_matrix(self, centroids, X):
        """Return an (n_samples, n_centroids) matrix of point-to-centroid distances."""
        return np.column_stack([self._euclidean_distance(c, X) for c in centroids])

    def fit(self, df_X):
        """Fit centroids to ``df_X`` and store the best of ``n_init`` runs.

        Parameters
        ----------
        df_X : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        MyKMeans
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If the data is not 2-D, has no rows, or if ``n_clusters`` or
            ``n_init`` is smaller than 1.
        """
        X = self._as_2d_array(df_X)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit MyKMeans on data with zero samples")
        if self.n_clusters < 1 or self.n_init < 1:
            raise ValueError("n_clusters and n_init must both be at least 1")

        # A private generator keeps fit() from reseeding NumPy's global RNG,
        # which would silently alter every later np.random call in the session.
        rng = np.random.RandomState(self.random_state)

        # The sampling bounds depend only on X, so compute them once.
        lows = X.min(axis=0)
        highs = X.max(axis=0)

        all_centroids = []
        all_wcss = []
        for _ in range(self.n_init):
            centroids = rng.uniform(lows, highs, size=(self.n_clusters, n_features))

            for _ in range(self.max_iter):
                labels = np.argmin(self._distance_matrix(centroids, X), axis=1)

                previous = centroids
                # Start from a copy so an empty cluster keeps its old centroid.
                centroids = previous.copy()
                for k in range(self.n_clusters):
                    members = X[labels == k]
                    if len(members) > 0:
                        centroids[k] = members.mean(axis=0)

                # The freshly updated centroids are kept even when converged,
                # so the stored centers are the means of the final assignment.
                if np.allclose(previous, centroids, atol=1e-4, rtol=1e-12):
                    break

            labels = np.argmin(self._distance_matrix(centroids, X), axis=1)
            wcss = sum(
                self._calculate_wcss(centroids[k], X[labels == k])
                for k in range(self.n_clusters)
            )
            all_wcss.append(wcss)
            all_centroids.append(centroids)

        best_run = np.argmin(all_wcss)
        self.cluster_centers_ = all_centroids[best_run]
        self.inertia_ = all_wcss[best_run]
        return self

    def predict(self, df_X):
        """Return the index of the nearest fitted centroid for every row of ``df_X``.

        Parameters
        ----------
        df_X : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Data to label.

        Returns
        -------
        ndarray of shape (n_samples,)
            Index into ``cluster_centers_`` of the closest centroid per row.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        ValueError
            If the data is not 2-D.
        """
        if self.cluster_centers_ is None:
            raise RuntimeError(
                "This MyKMeans instance is not fitted yet; call fit() before predict()"
            )
        X_new = self._as_2d_array(df_X)
        return np.argmin(self._distance_matrix(self.cluster_centers_, X_new), axis=1)
        
        

In [228]:
# Ten clusters, at most ten refinement passes per run, three random restarts.
model = MyKMeans(n_clusters=10, max_iter=10, n_init=3)

In [229]:
def make_demo_frame(n_rows, seed):
    """Build a synthetic five-feature frame for exercising the clustering code.

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    seed : int
        Seed for the private random generator behind the two random columns,
        so re-running the cell reproduces the same frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_1`` .. ``feature_5`` with ``n_rows`` rows.
    """
    rng = np.random.RandomState(seed)
    return pd.DataFrame({
        # n_rows evenly spaced values from 0.9 to 9.0 (inclusive)
        'feature_1': np.linspace(0.9, 9.0, n_rows),
        # n_rows evenly spaced values from 0.5 up to, but excluding, 9.3;
        # linspace avoids the off-by-one length risk of arange with a float step
        'feature_2': np.linspace(0.5, 9.3, n_rows, endpoint=False),
        # n_rows evenly spaced values from 10 to 500 (inclusive)
        'feature_3': np.linspace(10, 500, n_rows),
        # n_rows draws from the standard normal distribution
        'feature_4': rng.randn(n_rows),
        # n_rows random integers from 0 to 999
        'feature_5': rng.randint(0, 1000, n_rows),
    })


# Different seeds so the test rows are not a replay of the training noise.
X_df = make_demo_frame(100, seed=0)
X_test = make_demo_frame(20, seed=1)

In [231]:
# Fit on the training frame, then print the centroids of the lowest-WCSS run.
model.fit(X_df)
print(model.cluster_centers_)

[[ 4.19160839e+00  4.04030769e+00  2.09121989e+02 -2.13991303e-01
   8.60923077e+02]
 [ 6.50000000e+00  6.52311111e+00  3.48765432e+02  3.56965903e-01
   9.63333333e+01]
 [ 6.35454545e+00  6.36666667e+00  3.39966330e+02  4.95778068e-01
   6.68266667e+02]
 [ 2.43223140e+00  2.14800000e+00  1.02690542e+02 -4.44155735e-01
   6.24909091e+02]
 [ 2.74090909e+00  2.48000000e+00  1.21363636e+02  3.43625058e-01
   9.97500000e+01]
 [ 7.45568182e+00  7.55100000e+00  4.06578283e+02  9.94071438e-03
   4.85375000e+02]
 [ 1.35000000e+00  9.84000000e-01  3.72222222e+01 -1.37360517e-01
   8.82666667e+02]
 [ 7.58677686e+00  7.69200000e+00  4.14508724e+02  3.16441152e-01
   8.97454545e+02]
 [ 3.51818182e+00  3.31600000e+00  1.68383838e+02  2.33791013e-01
   3.34800000e+02]
 [ 8.13272727e+00  8.27920000e+00  4.47535354e+02 -1.16872782e-01
   2.18800000e+02]]


In [232]:
# Label every test row with the index of its nearest fitted centroid;
# as the cell's last expression, the label array is shown as the cell output.
model.predict(X_test)

array([8, 3, 8, 6, 3, 8, 4, 0, 4, 2, 8, 8, 1, 2, 9, 7, 9, 5, 7, 1])