# K-Means

1. Notebooks
   - [from scratch](https://www.kaggle.com/code/fareselmenshawii/kmeans-from-scratch)
   - [vectorized](https://www.kaggle.com/code/fareselmenshawii/vectorization)
   - [sklearn](https://www.kaggle.com/code/fareselmenshawii/kmeans-iris-clustering)

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

## Data

In [3]:
# Read the Iris table and discard the 'Id' column; it is a row identifier,
# not a measurement.
iris = pd.read_csv('./data/Iris.csv').drop(columns='Id')

# Preview the first rows, shaded with a cubehelix colormap.
preview_cmap = sns.cubehelix_palette(as_cmap=True)
iris.head().style.background_gradient(cmap=preview_cmap)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Feature matrix: every column except the last one (the species label).
# Converted to a NumPy array because the cells below index it positionally
# (e.g. ``x[mask, 0]`` and ``x[:, np.newaxis]``), which a DataFrame does not
# support.
x = iris.iloc[:, :-1].to_numpy()
# Ground-truth species labels (last column).
y = iris.iloc[:, -1]

In [5]:
# Box plot of sepal length, one box per species.
species_palette = ['#29066B', '#7D3AC1', '#EB548C']
fig = px.box(
    data_frame=iris,
    x='Species',
    y='SepalLengthCm',
    color='Species',
    color_discrete_sequence=species_palette,
    orientation='v',
)
fig.show()

## Model

In [None]:
import numpy as np

class Kmeans:
    """K-means clustering (Lloyd's algorithm) written with NumPy.

    Parameters
    ----------
    K : int
        Number of clusters.

    Attributes
    ----------
    k : int
        Number of clusters.
    centr : numpy.ndarray
        Current centroids, one row per cluster. Set by ``init_centr`` and
        updated by ``fit``.
    """

    def __init__(self, K):
        self.k = K

    def init_centr(self, x):
        """Use ``k`` randomly chosen rows of ``x`` as the starting centroids.

        The rows are picked via a random permutation of the row indices, so
        the same row index is never used twice.
        """
        x = np.asarray(x, dtype=float)
        assert x.shape[0] >= self.k
        rand_x = np.random.permutation(x.shape[0])
        centr_idx = rand_x[:self.k]
        self.centr = x[centr_idx]

    def assign_centr(self, x):
        """Return, for every row of ``x``, the index of its nearest centroid."""
        x = np.asarray(x, dtype=float)
        # Broadcasting (n, 1, d) - (k, d) gives the (n, k) matrix of
        # Euclidean distances between each point and each centroid.
        distance = np.linalg.norm(x[:, np.newaxis] - self.centr, axis=2)
        return np.argmin(distance, axis=1)

    def cmp_mean(self, x, points):
        """Return the new centroids: the mean of the rows assigned to each cluster.

        A cluster that received no rows is re-seeded with a random row of
        ``x`` so that ``k`` centroids are always returned.
        """
        x = np.asarray(x, dtype=float)
        points = np.asarray(points)
        new_centr = []
        for i in range(self.k):
            cluster_points = x[points == i]
            if len(cluster_points) == 0:
                # Reinitialise the empty cluster randomly.
                new_centr.append(x[np.random.randint(0, len(x))])
            else:
                new_centr.append(cluster_points.mean(axis=0))
        return np.vstack(new_centr)

    def fit(self, x, iter=10):
        """Cluster the rows of ``x`` and return ``(centroids, labels)``.

        Parameters
        ----------
        x : array-like, 2-D
            Numeric data, one sample per row. Anything ``np.asarray`` can
            convert (ndarray, nested list, DataFrame) is accepted.
        iter : int, default 10
            Maximum number of assign/update rounds. The loop stops earlier
            once an update leaves every centroid unchanged.

        Returns
        -------
        centroids : numpy.ndarray
            Final centroids, one row per cluster.
        labels : numpy.ndarray
            Index of the nearest returned centroid for every row of ``x``.
        """
        x = np.asarray(x, dtype=float)
        self.init_centr(x)

        for _ in range(iter):
            points = self.assign_centr(x)
            new_centr = self.cmp_mean(x, points)

            # Sanity checks on the invariants of one update step.
            assert new_centr.shape == (self.k, x.shape[1])
            assert points.max() < self.k
            assert points.min() >= 0

            # A round with an empty cluster involves a random re-seed, so it
            # never counts as converged even if the values happen to match.
            every_cluster_used = np.bincount(points, minlength=self.k).all()
            converged = every_cluster_used and np.array_equal(new_centr, self.centr)
            self.centr = new_centr
            if converged:
                break

        # Label against the centroids that are actually returned. This keeps
        # labels and centroids consistent and also covers ``iter == 0``.
        points = self.assign_centr(x)
        return self.centr, points


In [14]:
# Seed NumPy's global RNG: Kmeans draws its initial centroids from it, so
# without a seed the clustering (and the plot below) changes on every run.
np.random.seed(42)

kmeans = Kmeans(3)
centroids, points = kmeans.fit(x, 1000)

In [15]:
# Scatter of the first two feature columns, coloured by the cluster that the
# from-scratch model assigned to each sample.
features = np.asarray(x)  # positional indexing below needs an ndarray
cluster_colors = ['#DB4CB2', '#c9e9f6', '#7D3AC1']

fig = go.Figure()
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = points == cluster_id
    # Cluster ids are arbitrary, so traces get neutral names instead of
    # species names.
    fig.add_trace(go.Scatter(
        x=features[in_cluster, 0], y=features[in_cluster, 1],
        mode='markers', marker_color=color, name=f'Cluster {cluster_id}'
    ))

fig.add_trace(go.Scatter(
    x=centroids[:, 0], y=centroids[:, 1],
    mode='markers', marker_color='#CAC9CD', marker_symbol=4, marker_size=13, name='Centroids'
))
fig.update_layout(
    template='plotly_dark', width=1000, height=500,
    title='K-Means from scratch',
    xaxis_title='SepalLengthCm', yaxis_title='SepalWidthCm',
)

## scikit-learn

In [None]:
# Elbow method: fit k-means for k = 1..8 and record each model's inertia_
# (sum of squared distances from every sample to its closest centroid).
# The k where the curve stops dropping sharply is the number of clusters to use.
cluster_counts = list(range(1, 9))
sse = []
for n_clusters in cluster_counts:
    # Separate name so the from-scratch ``kmeans`` object is not overwritten;
    # n_init and random_state match the final model for reproducible curves.
    elbow_model = KMeans(n_clusters=n_clusters, max_iter=300, n_init=10, random_state=0)
    elbow_model.fit(x)
    sse.append(elbow_model.inertia_)

# Pass the cluster counts explicitly; otherwise the x-axis shows the list
# index (0..7) and the elbow reads one too low.
fig = px.line(x=cluster_counts, y=sse, template="seaborn", title='Elbow Method')
fig.update_layout(width=800, height=600,
title_font_color="#BF40BF",
xaxis=dict(color="#BF40BF",title="Clusters"),
yaxis=dict(color="#BF40BF",title="SSE"))


In [20]:
# The elbow curve above suggests 3 clusters.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
# Fit the model and keep the cluster index assigned to every sample.
clusters = kmeans.fit_predict(x)

In [21]:
# Scatter of the first two feature columns, coloured by the cluster that
# scikit-learn's KMeans assigned to each sample (``clusters``), with the
# fitted model's own centroids, not the from-scratch results.
features = np.asarray(x)  # positional indexing below needs an ndarray
sk_centroids = kmeans.cluster_centers_
cluster_colors = ['#DB4CB2', '#c9e9f6', '#7D3AC1']

fig = go.Figure()
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = clusters == cluster_id
    # Cluster ids are arbitrary, so traces get neutral names instead of
    # species names.
    fig.add_trace(go.Scatter(
        x=features[in_cluster, 0], y=features[in_cluster, 1],
        mode='markers', marker_color=color, name=f'Cluster {cluster_id}'
    ))

fig.add_trace(go.Scatter(
    x=sk_centroids[:, 0], y=sk_centroids[:, 1],
    mode='markers', marker_color='#CAC9CD', marker_symbol=4, marker_size=13, name='Centroids'
))
fig.update_layout(
    template='plotly_dark', width=1000, height=500,
    title='K-Means with scikit-learn',
    xaxis_title='SepalLengthCm', yaxis_title='SepalWidthCm',
)