# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing

In [2]:
# Read the space-separated text file with no header row. As the preview in the
# next cell shows, the leading '#' metadata lines are parsed as ordinary rows
# and the numeric rows get an empty first column; both are removed further down.
data = pd.read_csv('multigauss.txt', sep = " ", header = None)

In [3]:
# Preview the first rows to see how the file was parsed.
data.head()

Unnamed: 0,0,1,2
0,#,name:,XX
1,#,type:,matrix
2,#,rows:,1250
3,#,columns:,2
4,,1.343973435284349,0.1511043422294572


In [4]:
# Column 0 only holds the '#' markers (metadata rows) or an empty field (numeric rows), so discard it.
# Rebinding with errors = 'ignore' keeps this cell safe to re-run: once the
# column is gone a second execution is a no-op instead of a KeyError.
data = data.drop(columns = 0, errors = 'ignore')

In [5]:
# Rows 0-3 are the '# name / type / rows / columns' metadata lines that were parsed as data; remove them.
# errors = 'ignore' makes re-running this cell a no-op instead of a KeyError
# (the remaining rows keep their original labels, starting at 4).
data = data.drop(index = [0, 1, 2, 3], errors = 'ignore')

In [6]:
# Re-check the preview: only the two numeric columns (labelled 1 and 2) should remain.
data.head()

Unnamed: 0,1,2
4,1.343973435284349,0.1511043422294572
5,0.9626717660160128,0.3827812289862362
6,0.5023257633515988,-0.1813927047883676
7,0.3677464033583122,0.804004582610822
8,-0.8079136978717317,0.2692090961704308


In [7]:
# The metadata rows shared these columns with the numbers, so the values were
# presumably read in as strings (object dtype); cast everything to float64.
data = data.astype('float64')

In [8]:
# Count missing values per column as a sanity check on the cleaned frame.
data.isna().sum()

1    0
2    0
dtype: int64

# Modelling

In [9]:
class KMeans():
    """K-means clustering (Lloyd's algorithm) on standardised features.

    Attributes:
        mu:  per-feature mean of the training data (pandas Series); None until fit.
        std: per-feature sample standard deviation used for scaling (pandas
             Series); features with no spread are scaled by 1. None until fit.
        k:   fitted centroids, shape (n_clusters, n_features), expressed in
             standardised feature space. Despite the name this is NOT the
             number of clusters. None until fit.
    """

    def __init__(self):
        self.mu = None
        self.std = None
        self.k = None

    def fit(self, X, k, random_state = 42, max_iter = 300):
        """Learn `k` centroids from X.

        Args:
            X: training samples, one row per sample. A pandas DataFrame, or
               any 2-D array-like (which is wrapped in a DataFrame).
            k: number of clusters; must be at least 1.
            random_state: seed for the centroid initialisation.
            max_iter: upper bound on update passes. The loop normally stops
                earlier, as soon as a pass leaves every centroid unchanged.

        Raises:
            ValueError: if X has no rows or k < 1.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(np.asarray(X, dtype = float))
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one row")
        if k < 1:
            raise ValueError("k must be a positive integer")

        self.mu = X.mean()
        spread = X.std()
        # A constant column has zero std (NaN when there is a single row).
        # Dividing by it would fill the centroids with NaN/inf, and since
        # NaN != NaN the convergence check below could then never succeed.
        self.std = spread.where(spread > 0, 1.0)
        Z = self._standardize(X)

        # Seeds are drawn from the unit cube and pushed through the same
        # scaling as the data, so a given random_state gives the same start.
        rng = np.random.default_rng(random_state)
        centroids = rng.random((k, Z.shape[1]))
        centroids = (centroids - self.mu.values) / self.std.values

        for _ in range(max_iter):
            # Assign every point to its nearest centroid.
            labels = self._assign(Z, centroids)

            # Move each non-empty cluster's centroid to the mean of its
            # members; clusters with no members keep their previous centroid.
            updated = centroids.copy()
            for c in np.unique(labels):
                updated[c] = Z[labels == c].mean(axis = 0).to_numpy()

            # Converged: a full pass changed nothing.
            if np.array_equal(updated, centroids):
                break
            centroids = updated

        self.k = centroids

    def predict(self, X):
        """Return, for each row of X, the index of the nearest fitted centroid.

        Args:
            X: samples with the same features as the training data (DataFrame
               or 2-D array-like).

        Returns:
            1-D numpy integer array of cluster indices, one per row.

        Raises:
            RuntimeError: if called before fit.
        """
        if self.k is None:
            raise RuntimeError("KMeans.predict called before fit")
        return self._assign(self._standardize(X), self.k)

    def getDist(self, a):
        """Return the Euclidean norm of each row of `a` (a 2-D array or DataFrame)."""
        return np.sqrt(np.sum(np.square(a), axis = 1))

    def _standardize(self, X):
        """Scale X with the fitted mean and std, returning a DataFrame."""
        if not isinstance(X, pd.DataFrame):
            # Array-likes carry no labels, so they take the fitted columns by position.
            X = pd.DataFrame(np.asarray(X, dtype = float), columns = self.mu.index)
        return (X - self.mu) / self.std

    def _assign(self, Z, centroids):
        """Return the index of the closest centroid for every row of Z."""
        # distances[i, j] = distance from centroid i to sample j
        distances = np.array([np.asarray(self.getDist(Z - c)) for c in centroids])
        return np.argmin(distances, axis = 0)

In [28]:
# Create an unfitted model; the number of clusters is passed to fit(), not the constructor.
kmeans = KMeans()

In [33]:
# Cluster the cleaned data into 10 groups (random_state is left at fit()'s default of 42).
kmeans.fit(data, 10)

In [34]:
# Label every row with the index of its nearest fitted centroid.
pred = kmeans.predict(data)

In [35]:
# Display the predicted cluster index for each row.
pred

array([9, 9, 9, ..., 2, 2, 2], dtype=int64)

In [36]:
# Despite the name, `k` holds the fitted centroids (one row per cluster), expressed in
# standardised feature space rather than in the units of the original data.
kmeans.k

array([[ 1.29863641, -0.87778869],
       [ 1.28560755,  0.86830648],
       [-1.07256782,  1.08178296],
       [ 1.14170246,  1.48057849],
       [-1.18141164, -1.21160269],
       [ 0.71198622,  1.00486803],
       [-0.24810802,  0.24288952],
       [-0.81648073, -0.74807219],
       [ 0.87556055, -1.23505695],
       [ 0.23952082, -0.15342907]])