In [1]:
import numpy as np
import pandas as pd

In [30]:
from sklearn.datasets import make_blobs

# Synthetic demo data: 100 samples with 5 features generated around 5 centers.
X, _ = make_blobs(
    n_samples=100,
    centers=5,
    n_features=5,
    cluster_std=2.5,
    random_state=42,
)
# Wrap the raw array in a DataFrame whose columns are named col_0 ... col_4.
X = pd.DataFrame(X).add_prefix('col_')

In [296]:
import copy 

class MyKMeans:
    """K-means clustering with uniformly random centroid initialisation.

    Parameters
    ----------
    n_clusters : int, default 3
        Number of clusters.
    max_iter : int, default 10
        Maximum number of assign/update iterations per run.
    n_init : int, default 3
        Number of independent runs; the run with the smallest
        within-cluster sum of squares (WCSS) is kept.
    random_state : int or None, default 42
        Seed for the centroid initialisation.

    Attributes
    ----------
    cluster_centers_ : list of list of float
        Centroids of the best run (empty until ``fit`` is called).
    clusters : list
        ``clusters[run][k]`` lists the positional row indices assigned to
        cluster ``k`` during run ``run``.
    inertia_ : float
        WCSS of the best run.
    """

    def __init__(self, n_clusters=3, max_iter=10, n_init=3, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.n_init = n_init
        self.random_state = random_state

        self.cluster_centers_ = []  # cluster centroids of the best run
        self.new_centroids_values = []  # unused; kept for backward compatibility
        self.clusters = []  # per-run cluster memberships
        self.inertia_ = 0

    def __repr__(self):
        return f'MyKMeans class: n_clusters={self.n_clusters}, max_iter={self.max_iter}, n_init={self.n_init}, random_state={self.random_state}'

    def fit(self, X):
        """Cluster ``X`` and store the best of ``n_init`` runs.

        ``X`` may be a pandas DataFrame (any index) or a 2-D array-like of
        numeric features. Rows are addressed by position, so the DataFrame
        index does not have to be ``0..n-1``. All fitted state is rebuilt on
        every call, so the estimator can be refitted safely.

        Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has no rows, or ``n_clusters``/``n_init``
            is smaller than 1.
        """
        data = self._as_matrix(X)
        if data.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if self.n_clusters < 1 or self.n_init < 1:
            raise ValueError("n_clusters and n_init must be at least 1")

        # A private generator keeps fitting reproducible without reseeding
        # the process-wide NumPy random state.
        rng = np.random.RandomState(self.random_state)
        lower = data.min(axis=0)
        upper = data.max(axis=0)

        runs = [
            self._single_run(data, rng, lower, upper)
            for _ in range(self.n_init)
        ]
        inertias = np.array([run[2] for run in runs], dtype=float)
        best = int(np.argmin(inertias))

        self.clusters = [run[1] for run in runs]
        self.inertia_ = inertias[best]
        self.cluster_centers_ = runs[best][0].tolist()
        return self

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``.

        The labels are returned as a float array, matching the historical
        behaviour of this class.

        Raises
        ------
        ValueError
            If the model has not been fitted or the number of features in
            ``X`` differs from the fitted centroids.
        """
        centers = np.asarray(self.cluster_centers_, dtype=float)
        if centers.ndim != 2 or centers.shape[0] == 0:
            raise ValueError("MyKMeans is not fitted yet; call fit() first")
        data = self._as_matrix(X)
        if data.shape[1] != centers.shape[1]:
            raise ValueError(
                f"X has {data.shape[1]} features, expected {centers.shape[1]}"
            )
        return self._nearest_center(data, centers).astype(float)

    def euclidean_distance(self, x2, x1):
        """Return the Euclidean distance between two numeric vectors."""
        return np.sqrt(np.sum(np.square(x2 - x1)))

    def get_WCSS(self, X, d, C):
        """Return the sum of squared distances from the points ``d`` to ``C``.

        ``X`` is either a DataFrame, in which case ``d`` holds index labels
        (looked up with ``.loc``), or an array, in which case ``d`` holds
        positional row indices. ``C`` is the centroid. An empty ``d``
        yields 0.
        """
        if len(d) == 0:
            return 0
        if hasattr(X, 'loc'):
            points = X.loc[list(d), :].values
        else:
            points = np.asarray(X)[d]
        return np.sum(np.square(points - np.asarray(C, dtype=float)))

    def _single_run(self, data, rng, lower, upper):
        """Run one k-means pass; return (centers, memberships, wcss)."""
        n_features = data.shape[1]
        # Initial centroids are drawn uniformly inside the per-feature range.
        centers = rng.uniform(lower, upper, size=(self.n_clusters, n_features))
        members = [np.empty(0, dtype=int) for _ in range(self.n_clusters)]

        for _ in range(self.max_iter):
            labels = self._nearest_center(data, centers)
            members = [
                np.flatnonzero(labels == k) for k in range(self.n_clusters)
            ]

            updated = centers.copy()
            for k, idx in enumerate(members):
                # An empty cluster keeps its previous centroid.
                if idx.size:
                    updated[k] = data[idx].mean(axis=0)

            # Unchanged centroids mean the assignment can no longer change.
            converged = np.array_equal(updated, centers)
            centers = updated
            if converged:
                break

        wcss = sum(
            self.get_WCSS(data, idx, centers[k])
            for k, idx in enumerate(members)
        )
        return centers, [idx.tolist() for idx in members], wcss

    @staticmethod
    def _as_matrix(X):
        """Convert ``X`` to a 2-D float array, raising ValueError otherwise."""
        data = np.asarray(X, dtype=float)
        if data.ndim != 2:
            raise ValueError("X must be two-dimensional (n_samples, n_features)")
        return data

    @staticmethod
    def _nearest_center(data, centers):
        """Return, for each row of ``data``, the index of the closest center."""
        diffs = data[:, np.newaxis, :] - centers[np.newaxis, :, :]
        # Squared distances give the same argmin as true distances.
        return np.square(diffs).sum(axis=2).argmin(axis=1)