# K-Means Clustering in Python

### Use Cases of Clustering

-----------------------------------------
##### Audience Segmentation
![](img/audience.png)


-----------------------------------------
##### Market Segmentation
![](img/market.png)


#### Importing Packages

In [None]:
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Default size for every figure created afterwards: (16, 9) as (width, height).
plt.rcParams['figure.figsize'] = (16, 9)
# Switch matplotlib to its bundled 'ggplot' style sheet for all later plots.
plt.style.use('ggplot')

%matplotlib inline

#### Importing Data 

In [None]:
# Read the CSV data set into a DataFrame.
df = pd.read_csv('data/xclara.csv')

# Report the (rows, columns) shape of the table that was just loaded.
print("Input Data and Shape", df.shape, sep="\n")
df.head()

# NumPy array of the DataFrame's values; the clustering code works on this.
X = df.values
X

#### EDA

In [None]:
# Scatter plot of column V1 against column V2 (trailing ';' hides the repr in a notebook).
df.plot(kind="scatter", x="V1", y="V2");

#### How Algorithm Works

![](img/steps.png)

![](img/ex_gif.gif)


### Step-1 Pick K random points as cluster centers called centroids.
###### Choosing Optimal K - Elbow Method 
###### (in this case we can choose K directly, since the scatter plot already shows how the data is spread)

![](img/elbow.png)

In [None]:
# Number of clusters / centroids.
k = 3
# Random integer x-coordinates, drawn from [0, max(first column) - 20).
C_x = np.random.randint(0, X[:, 0].max() - 20, size=k)
# Random integer y-coordinates, drawn from [0, max(second column) - 20).
C_y = np.random.randint(0, X[:, 1].max() - 20, size=k)
# Pair the coordinates up into a (k, 2) float32 matrix, one centroid per row.
C = np.column_stack((C_x, C_y)).astype(np.float32)
print("Initial Centroids")
print(C)

In [None]:
# Plotting along with the Centroids
# Raw observations drawn as small near-black dots.
plt.scatter(df["V1"], df["V2"], s=7, c='#050505')
# Randomly chosen starting centroids overlaid as large red circles.
plt.scatter(C_x, C_y, s=200, c='r', marker='o');

### Step 2 - Assign each $x_{i}$ to the nearest cluster by calculating its distance to each centroid

In [None]:
### Euclidean Distance
def eucli(a, b, ax=1):
    """Return the Euclidean (L2) norm of ``a - b``.

    Args:
        a: Array-like operand; must support ``a - b`` (NumPy broadcasting).
        b: Array-like operand subtracted from ``a``.
        ax: Axis handed to ``np.linalg.norm``. The default ``1`` yields one
            distance per row of the difference; ``None`` collapses the whole
            difference into a single number.

    Returns:
        The norm(s) of the difference, as computed by ``np.linalg.norm``.
    """
    difference = a - b
    return np.linalg.norm(difference, axis=ax)

In [None]:
# Holder for the previous centroid positions, initialised to all zeros
C_old = np.zeros(C.shape)
# Cluster labels (0, 1, 2), one entry per row of X
clusters = np.zeros(len(X))
dist = []
# Label every point with the index of its nearest centroid
for idx, point in enumerate(X):
    clusters[idx] = np.argmin(eucli(point, C))

### Step 3 - Find the new cluster centers by taking the average of the points assigned to each cluster.

In [None]:
# Distance between the current and previous centroid matrices, as a single
# number (axis=None takes the norm over every element at once).
error = eucli(C, C_old, None)
print("Initial Clusters::\n")
# How many points each cluster label received.
initial_sizes = pd.Series(clusters).value_counts()
print(initial_sizes)
print("Initial Error::", error)
# Snapshot the centroids so the next update can be compared against them.
C_old = deepcopy(C)

In [None]:
# Move each centroid to the mean of the points currently assigned to it.
for i in range(k):
    # Boolean mask selects the rows of X labelled with cluster i.
    points = X[clusters == i]
    # A centroid that attracted no points keeps its position: the mean of an
    # empty selection is NaN, which would corrupt every later distance
    # computation involving this centroid.
    if len(points) > 0:
        C[i] = points.mean(axis=0)

### Step 4 - Repeat Steps 2 and 3 until none of the cluster assignments change (i.e., the error becomes 0)

In [None]:
# Holder for the previous centroid positions, initialised to all zeros
C_old = np.zeros(C.shape)
# Cluster labels (0, 1, 2), one entry per row of X
clusters = np.zeros(len(X))
# Error func. - distance between the new centroids and the old centroids
error = eucli(C, C_old, None)
# Iterate until the centroids stop moving (error becomes exactly zero)
while error != 0:
    # Assignment step: label each point with the index of its closest centroid
    for i in range(len(X)):
        clusters[i] = np.argmin(eucli(X[i], C))
    # Remember the centroids before they are moved
    C_old = deepcopy(C)
    # Update step: move each centroid to the mean of its assigned points
    for i in range(k):
        points = X[clusters == i]
        # An empty cluster keeps its previous centroid. Taking the mean of
        # zero points would give NaN, making `error` NaN as well; since
        # NaN != 0 is always true, the loop would then never terminate.
        if len(points) > 0:
            C[i] = points.mean(axis=0)
    error = eucli(C, C_old, None)

In [None]:
# One colour per cluster; the palette is cycled if k exceeds its length.
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for i in range(k):
    # Boolean-mask selection always yields a 2-D array (possibly with zero
    # rows), so the column slices below stay valid even for an empty cluster.
    points = X[clusters == i]
    ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i % len(colors)])
# Final centroids drawn as large near-black circles on top of the points.
ax.scatter(C[:, 0], C[:, 1], marker='o', s=200, c='#050505');


In [None]:
print("Final Clusters::\n")
# Number of points that ended up under each cluster label.
final_sizes = pd.Series(clusters).value_counts()
print(final_sizes)
# Last centroid-shift value computed by the iteration.
print("Final Error::", error)

In [None]:
#### Comparing with SKLEARN
from sklearn.cluster import KMeans

# Use the same number of clusters as the from-scratch run (k) instead of a
# hard-coded 3, so the two results stay comparable if k is changed.
kmeans = KMeans(n_clusters=k)
# Fitting the input data
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_

# Comparing with scikit-learn centroids
# NOTE(review): the two implementations may list matching centroids in a
# different row order - compare them as sets of points, not row by row.
print("Centroid values")
print("Scratch")
print(C) # From Scratch
print("sklearn")
print(centroids) # From sci-kit learn