In [None]:
import tensorflow as tf
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Load MNIST through Keras; it arrives already split into train and test parts
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Clustering is unsupervised, so merge both parts back into a single dataset
X = np.vstack([X_train, X_test])  # all the images, stacked along the sample axis
y = np.hstack([y_train, y_test])  # the matching labels

# Flatten each 28 x 28 image into a single row of pixel values
data = X.reshape(len(X), -1)

print(X.shape)
print(y.shape)
print(data.shape)

(70000, 28, 28)
(70000,)
(70000, 784)


In [None]:
# Peek at the first flattened image (one row of raw pixel values)
print(data[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   3  18  18  18 126 136 175  26 166 255
 247 127   0   0   0   0   0   0   0   0   0   0   0   0  30  36  94 154
 170 253 253 253 253 253 225 172 253 242 195  64   0   0   0   0   0   0
   0   0   0   0   0  49 238 253 253 253 253 253 253 253 253 251  93  82
  82  56  39   0   0   0   0   0   0   0   0   0   0   0   0  18 219 253
 253 253 253 253 198 182 247 241   0   0   0   0   

In [None]:
# Initializing centroids
def init_centroids(data, k, seed=42):
    """Selects k distinct rows of the data at random as the initial centroids.

    Inputs:
    - data: Dataset of shape (n_samples, n_features).
    - k: Number of clusters (cannot exceed n_samples, since rows are drawn
      without replacement).
    - seed: Seed of the private random generator. The default (42) selects the
      same rows as seeding the global NumPy generator with 42 would.
    output:
    - centroids: Array of shape (k, n_features) with the selected rows.
    """
    # A private RandomState keeps the draw reproducible without reseeding the
    # global np.random state, which would silently affect every later random
    # call made elsewhere in the notebook.
    rng = np.random.RandomState(seed)
    chosen_rows = rng.choice(data.shape[0], k, replace=False)
    return data[chosen_rows]


In [None]:
# E-step: given the centroids u_k, computation of the new membership (pi_ik)
def e_step(data, centroids):
    """Hard-assigns every data point to its nearest centroid.

    Inputs:
    - data: Dataset of shape (n_samples, n_features).
    - centroids: Current centroids of shape (k, n_features).
    output:
    - memberships: Matrix of shape (n_samples, k); entry (i, j) is 1 when
      point x_i belongs to cluster j and 0 otherwise.
    """
    # Pairwise distances, one row per data point and one column per centroid
    dist_to_centroids = euclidean_distances(data, centroids)

    # Column index of the closest centroid for each row
    nearest = dist_to_centroids.argmin(axis=1)

    # One-hot encode the assignment, keeping the shape/dtype of the distance matrix
    one_hot = np.zeros_like(dist_to_centroids)
    one_hot[np.arange(len(nearest)), nearest] = 1
    return one_hot


In [None]:
# Objective function: computes the total cost J for k-means
def compute_objective(data, centroids, memberships):
    """Computes the k-means cost J for the given centroids and memberships.

    Inputs:
    - data: Dataset of shape (n_samples, n_features).
    - centroids: Centroids of shape (k, n_features).
    - memberships: Membership matrix of shape (n_samples, k).
    output:
    - J: Sum over all (point, cluster) pairs of membership * squared distance.
    """
    # Ask for squared distances directly: taking the square root only to
    # square it again costs an extra pass over the matrix and adds rounding error.
    dist_squared = euclidean_distances(data, centroids, squared=True)

    # The membership matrix masks out every distance except the one to the
    # assigned cluster, so the sum runs over assigned pairs only
    return np.sum(memberships * dist_squared)



In [None]:
# M-step: update the centroids (u_k) based on the current memberships
def m_step(data, memberships, prev_centroids=None):
    """Recomputes each centroid (u_k) as the membership-weighted mean of the data.

    Inputs:
    - data: Dataset of shape (n_samples, n_features).
    - memberships: Membership matrix of shape (n_samples, k).
    - prev_centroids: Optional array of shape (k, n_features). A cluster with no
      members keeps its row from this array; when it is omitted, such a cluster
      is placed at the mean of the whole dataset instead.
    output:
    - centroids: Array of shape (k, n_features), always free of NaN rows caused
      by empty clusters.
    """
    # Numerator: weighted sum of the data points for each cluster
    weighted_sums = np.dot(memberships.T, data)
    # Denominator: total membership weight assigned to each cluster
    weights = np.sum(memberships.T, axis=1, keepdims=True)

    empty = weights[:, 0] == 0
    if not empty.any():
        return weighted_sums / weights

    # An empty cluster would be 0 / 0 = NaN, which poisons every later distance
    # computation. Divide by 1 for those rows, then overwrite them below.
    centroids = weighted_sums / np.where(weights == 0, 1, weights)
    if prev_centroids is not None:
        centroids[empty] = np.asarray(prev_centroids)[empty]
    else:
        centroids[empty] = data.mean(axis=0)
    return centroids


In [None]:
# KMeans function
def kmeans(data, k, max_iter):
    """Runs k-means by alternating E-steps and M-steps, logging the cost J.

    Inputs:
    - data: Dataset of shape (n_samples, n_features).
    - k: Number of clusters.
    - max_iter: Number of E-step/M-step rounds to run; must be at least 1.
    output:
    - centroids: Centroids produced by the last M-step.
    - memberships: Membership matrix from the last E-step, i.e. computed against
      the centroids that were current before the final M-step.
    - J_history: List of (iteration, 'E-step' or 'M-step', J) tuples, two
      entries per iteration.
    Raises:
    - ValueError: if max_iter is smaller than 1 (no memberships would exist to
      return).
    """
    if max_iter < 1:
        raise ValueError(f"max_iter must be at least 1, got {max_iter}")

    # Init
    centroids = init_centroids(data, k)

    # list to store the J's
    J_history = []

    for iteration in range(max_iter):
        # E-step updating memberships
        memberships = e_step(data, centroids)

        # objective function after e
        J_e = compute_objective(data, centroids, memberships)
        J_history.append((iteration + 1, 'E-step', J_e))
        print(f"Iteration {iteration + 1} (E-step), Objective Function J: {J_e}")

        # M-step: uk update. A cluster that received no points has no mean
        # (0 / 0), so silence the resulting warning and repair the row below.
        with np.errstate(divide='ignore', invalid='ignore'):
            new_centroids = m_step(data, memberships)

        # Keep the previous centroid for empty clusters so a NaN row never
        # reaches the next distance computation
        empty = np.sum(memberships, axis=0) == 0
        if empty.any():
            new_centroids[empty] = centroids[empty]
        centroids = new_centroids

        # objective function after m
        J_after = compute_objective(data, centroids, memberships)
        J_history.append((iteration + 1, 'M-step', J_after))
        print(f"Iteration {iteration + 1} (M-step), Objective Function J: {J_after}")

    # results
    return centroids, memberships, J_history

In [None]:
# Scale the pixel values by 1/255 before applying kmeans.
# The array is rebuilt from X instead of dividing `data` by itself, so running
# this cell more than once cannot shrink the values a second time.
data = X.reshape(X.shape[0], -1) / 255

In [None]:
# Spot-check the second sample after dividing by 255 (values are now floats)
print(data[1])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [None]:
# Run k-means with K = 5 clusters for 10 iterations
k, max_iter = 5, 10
centroids, memberships, J_values = kmeans(data, k=k, max_iter=max_iter)

Iteration 1 (E-step), Objective Function J: 5760108.875001922
Iteration 1 (M-step), Objective Function J: 3390782.9893569555
Iteration 2 (E-step), Objective Function J: 3283140.2034430513
Iteration 2 (M-step), Objective Function J: 3214890.5095032346
Iteration 3 (E-step), Objective Function J: 3183235.4752597953
Iteration 3 (M-step), Objective Function J: 3162499.897672526
Iteration 4 (E-step), Objective Function J: 3146678.531293525
Iteration 4 (M-step), Objective Function J: 3131764.1825346327
Iteration 5 (E-step), Objective Function J: 3118478.170900364
Iteration 5 (M-step), Objective Function J: 3105396.5228889287
Iteration 6 (E-step), Objective Function J: 3094932.883492146
Iteration 6 (M-step), Objective Function J: 3086532.7163389493
Iteration 7 (E-step), Objective Function J: 3081304.5110710156
Iteration 7 (M-step), Objective Function J: 3077538.3769799634
Iteration 8 (E-step), Objective Function J: 3074808.0826217574
Iteration 8 (M-step), Objective Function J: 3072469.46478786

In [None]:
# Run k-means with K = 20 clusters for 10 iterations
k, max_iter = 20, 10
centroids, memberships, J_values = kmeans(data, k=k, max_iter=max_iter)


Iteration 1 (E-step), Objective Function J: 4508418.612010767
Iteration 1 (M-step), Objective Function J: 2840115.117477757
Iteration 2 (E-step), Objective Function J: 2699624.9908458497
Iteration 2 (M-step), Objective Function J: 2623393.778320279
Iteration 3 (E-step), Objective Function J: 2581401.5281632277
Iteration 3 (M-step), Objective Function J: 2551951.121088409
Iteration 4 (E-step), Objective Function J: 2533682.039337663
Iteration 4 (M-step), Objective Function J: 2521760.714705537
Iteration 5 (E-step), Objective Function J: 2514138.596529358
Iteration 5 (M-step), Objective Function J: 2508363.8102459903
Iteration 6 (E-step), Objective Function J: 2504058.7351279487
Iteration 6 (M-step), Objective Function J: 2500442.46244785
Iteration 7 (E-step), Objective Function J: 2497362.4613309023
Iteration 7 (M-step), Objective Function J: 2494487.825513134
Iteration 8 (E-step), Objective Function J: 2491936.726196248
Iteration 8 (M-step), Objective Function J: 2489520.7883204618
Ite

In [None]:
# Run k-means with K = 10 clusters for 10 iterations
k, max_iter = 10, 10
centroids, memberships, J_values = kmeans(data, k=k, max_iter=max_iter)

Iteration 1 (E-step), Objective Function J: 5120891.04139946
Iteration 1 (M-step), Objective Function J: 3087420.501402538
Iteration 2 (E-step), Objective Function J: 2950927.376023708
Iteration 2 (M-step), Objective Function J: 2869625.747947105
Iteration 3 (E-step), Objective Function J: 2831456.3746766816
Iteration 3 (M-step), Objective Function J: 2809540.8607475697
Iteration 4 (E-step), Objective Function J: 2798441.6962375813
Iteration 4 (M-step), Objective Function J: 2792236.0314630615
Iteration 5 (E-step), Objective Function J: 2788555.2622218677
Iteration 5 (M-step), Objective Function J: 2785997.6087895078
Iteration 6 (E-step), Objective Function J: 2784125.9819741175
Iteration 6 (M-step), Objective Function J: 2782637.1165767605
Iteration 7 (E-step), Objective Function J: 2781269.427014939
Iteration 7 (M-step), Objective Function J: 2779708.1123856446
Iteration 8 (E-step), Objective Function J: 2777873.682968295
Iteration 8 (M-step), Objective Function J: 2775425.3485100907

In [None]:
# Turn the one-hot membership rows (k=10 run) into one cluster index per sample
clusters = memberships.argmax(axis=1)

In [None]:
# Sanity check: expect a 1-D array with one cluster index per sample
print(clusters.shape)

(70000,)


In [None]:
# Function for Purity
# Purity measures how well the clusters align with the true labels
from collections import Counter

def purity(clusters, labels):
    """Computes clustering purity: the fraction of points whose true label is
    the most frequent label of the cluster they were assigned to.

    Inputs:
    - clusters: Cluster id of every sample, shape (n_samples,). The ids do not
      need to be contiguous or start at 0.
    - labels: True label of every sample, shape (n_samples,).
    output:
    - purity: Value in (0, 1]; 1 means every cluster holds a single label.
    Raises:
    - ValueError: if the inputs are empty or differ in length.
    """
    clusters = np.asarray(clusters)
    labels = np.asarray(labels)
    if clusters.shape[0] != labels.shape[0]:
        raise ValueError("clusters and labels must have the same length")
    if labels.shape[0] == 0:
        raise ValueError("purity is undefined for empty input")

    correct_labels = 0  # Points that carry the majority label of their cluster

    # Walk over the ids that actually occur. Assuming ids 0..n-1 breaks as soon
    # as one id is unused: the gap is an empty cluster and the largest ids are
    # never visited.
    for cluster in np.unique(clusters):
        cluster_labels = labels[clusters == cluster]  # True labels inside this cluster
        _, label_counts = np.unique(cluster_labels, return_counts=True)
        correct_labels += int(label_counts.max())  # Size of the majority label

    return correct_labels / labels.shape[0]




In [None]:
# Score the cluster assignments in `clusters` against the true digit labels
mnist_purity = purity(clusters, y)
print(f"Purity: {mnist_purity}")


Purity: 0.6017428571428571


In [None]:
# Function for Gini Index
# The Gini index measures the heterogeneity of labels within clusters

def gini_index(clusters, Y):
    """Computes the size-weighted average Gini impurity of the clusters.

    Inputs:
    - clusters: Cluster id of every sample, shape (n_samples,). The ids do not
      need to be contiguous or start at 0.
    - Y: True label of every sample, shape (n_samples,).
    output:
    - global_gini: Sum over clusters of (cluster size * (1 - sum_j p_j^2)),
      divided by n_samples; 0 means every cluster holds a single label.
    Raises:
    - ValueError: if the inputs are empty or differ in length.
    """
    clusters = np.asarray(clusters)
    Y = np.asarray(Y)
    n_samples = Y.shape[0]  # Total number of samples in the dataset
    if clusters.shape[0] != n_samples:
        raise ValueError("clusters and Y must have the same length")
    if n_samples == 0:
        raise ValueError("Gini index is undefined for empty input")

    total_gini = 0.0  # Accumulates cluster size * cluster Gini

    # Walk over the ids that actually occur. Assuming ids 0..n-1 silently drops
    # every cluster whose id is >= the number of distinct ids.
    for cluster in np.unique(clusters):
        cluster_labels = Y[clusters == cluster]  # True labels inside this cluster
        total_points = cluster_labels.shape[0]

        # Share of each label within the cluster
        _, label_counts = np.unique(cluster_labels, return_counts=True)
        proportions = label_counts / total_points

        # Gini index formula: 1 - sum(p_j^2), weighted by the cluster size
        gini = 1 - np.sum(proportions ** 2)
        total_gini += total_points * gini

    # Normalise by the dataset size to obtain the weighted average
    return float(total_gini / n_samples)




In [None]:
# Weighted Gini index of the cluster assignments against the true digit labels
gini_mnist = gini_index(clusters, y)
print(f"Gini: {gini_mnist}")

Gini: 0.5201990924664515
