In [None]:
import pandas as pd
import tensorflow as tf
import sklearn
import numpy as np

In [None]:
import zipfile
def load_images_from_zip(zip_path, file_name):
    """Read one image file stored inside a ZIP archive.

    Parameters
    ----------
    zip_path : path or file-like object accepted by ``zipfile.ZipFile``
        The archive to open.
    file_name : str
        Name of the archive member holding the raw image bytes.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(n_samples, 28, 28)``.
    """
    # Pull the whole member into memory; the archive is closed right after.
    with zipfile.ZipFile(zip_path, 'r') as archive:
        raw = archive.read(file_name)

    # The first 16 bytes are metadata, everything after is one byte per pixel.
    pixels = np.frombuffer(raw, dtype=np.uint8, offset=16)

    # Fold the flat pixel stream into a stack of 28x28 images.
    return pixels.reshape(-1, 28, 28)

def load_labels(file_name):
    """Read a label file from disk (not compressed).

    Parameters
    ----------
    file_name : str or path-like
        File to open in binary mode.

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array with one entry per byte after the header.
    """
    with open(file_name, 'rb') as handle:
        raw = handle.read()

    # The first 8 bytes are metadata; each remaining byte is one label.
    return np.frombuffer(raw, dtype=np.uint8, offset=8)

In [None]:
# Locations of the data and label files, relative to the working directory.
# The image files are ZIP archives; the label files are read as plain
# (uncompressed) binary files.
train_images_zip = 'train-images-idx3-ubyte.zip'
train_labels_file = 'train-labels-idx1-ubyte'
test_images_zip = 't10k-images-idx3-ubyte.zip'
test_labels_file = 't10k-labels-idx1-ubyte'

In [None]:
# Load the train/test images (from the ZIP archives) and their labels.
train_images = load_images_from_zip(train_images_zip, 'train-images-idx3-ubyte')
train_labels = load_labels(train_labels_file)
test_images = load_images_from_zip(test_images_zip, 't10k-images-idx3-ubyte')
test_labels = load_labels(test_labels_file)

# Fail fast if a file is truncated or images and labels were mixed up: every
# image needs exactly one label, otherwise X and y silently go out of sync.
assert len(train_images) == len(train_labels), "train images/labels count mismatch"
assert len(test_images) == len(test_labels), "test images/labels count mismatch"

In [None]:
# Unroll every image into a single row vector, then rescale the pixel values
# to the [0, 1] range by dividing by 255.
train_images_flat = train_images.reshape(len(train_images), -1) / 255.0
test_images_flat = test_images.reshape(len(test_images), -1) / 255.0


In [None]:
# Stack the train and test splits into one dataset (train rows first).
X = np.concatenate((train_images_flat, test_images_flat))

# Matching label vector, in the same order as the rows of X.
y = np.concatenate((train_labels, test_labels))

In [None]:
# Sanity check: X should have one row per image and one column per pixel.
print(X.shape)

(70000, 784)


In [None]:
# Sanity check: y should have one entry per row of X.
print(y.shape)

(70000,)


# **Starting with K-Means**

In [None]:
# Initialization
def init_centroids(k, data, seed=42):
    """Pick ``k`` distinct rows of ``data`` as the initial centroids.

    Parameters
    ----------
    k : int
        Number of centroids to draw.
    data : numpy.ndarray
        Array whose rows are the samples; indexed along its first axis.
    seed : int, optional
        Seed for the row selection. The default (42) selects the same rows
        as the previous hard-coded ``np.random.seed(42)``.

    Returns
    -------
    numpy.ndarray
        Copy of the ``k`` selected rows, in the order they were drawn.

    Raises
    ------
    ValueError
        Raised by NumPy when ``k`` exceeds the number of rows (sampling is
        without replacement).
    """
    # Use a private generator instead of reseeding the global NumPy RNG, so
    # calling this function does not reset randomness elsewhere in the notebook.
    rng = np.random.RandomState(seed)
    picked_rows = rng.choice(data.shape[0], k, replace=False)
    return data[picked_rows]

In [None]:
# E step
# from the previous centroids, computation of the membership pi_ik
from sklearn.metrics import pairwise_distances

def e_step(data, centroids):
    """Hard-assign every sample to its nearest centroid (Euclidean distance).

    Returns a binary matrix of shape ``[n_samples, k]`` with the same dtype as
    the distance matrix: entry ``(i, j)`` is 1 when sample ``i`` belongs to
    cluster ``j`` and 0 otherwise. Each row contains exactly one 1.
    """
    dist = pairwise_distances(data, centroids, metric='euclidean')

    # Column index of the closest centroid for each row (ties -> lowest index).
    nearest = dist.argmin(axis=1)

    one_hot = np.zeros_like(dist)
    one_hot[np.arange(dist.shape[0]), nearest] = 1
    return one_hot

In [None]:
# Objective fn
from sklearn.metrics import pairwise_distances

def compute_objective(data, centroids, memberships):
    """K-means objective J for the given centroids and memberships.

    J is the sum over all samples and clusters of
    ``memberships[i, j] * ||data[i] - centroids[j]||^2``.
    """
    # Euclidean distances from scikit-learn, squared element-wise.
    sq_dist = pairwise_distances(data, centroids, metric='euclidean') ** 2

    # Weight each squared distance by its membership and add everything up.
    weighted = memberships * sq_dist
    return np.sum(weighted)



In [None]:
# M step: given the memberships, compute the new centroids u_k
def m_step(data, memberships, prev_centroids=None):
    """Recompute each centroid as the membership-weighted mean of the data.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_samples, n_features)
    memberships : numpy.ndarray, shape (n_samples, k)
        Membership weights (one-hot rows for hard k-means).
    prev_centroids : numpy.ndarray, shape (k, n_features), optional
        Centroids to keep for clusters that received no members.

    Returns
    -------
    numpy.ndarray, shape (k, n_features)
        The updated centroids. A cluster with zero total membership would
        otherwise be 0/0 = NaN (and make the next E-step fail); such a cluster
        keeps its row from ``prev_centroids`` when given, or else is placed at
        the overall mean of ``data``.
    """
    weighted_sums = np.dot(memberships.T, data)
    totals = np.sum(memberships.T, axis=1, keepdims=True)

    # Divide by 1 instead of 0 for empty clusters; those rows are overwritten below.
    empty = totals[:, 0] == 0
    safe_totals = np.where(totals == 0, 1, totals)
    centroids = weighted_sums / safe_totals

    if empty.any():
        if prev_centroids is not None:
            centroids[empty] = np.asarray(prev_centroids)[empty]
        else:
            centroids[empty] = data.mean(axis=0)

    return centroids

In [None]:
# Kmeans function
def kmeans(data, k, max_iter, tol=None, verbose=True):
    """Run k-means (alternating E and M steps) and track the objective J.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_samples, n_features)
    k : int
        Number of clusters.
    max_iter : int
        Maximum number of E/M iterations; must be at least 1.
    tol : float, optional
        If given, stop early once J after the M-step improves by no more than
        ``tol`` compared with the previous iteration. ``None`` (default)
        always runs ``max_iter`` iterations.
    verbose : bool, optional
        Print J after every E-step and M-step (default True).

    Returns
    -------
    centroids : numpy.ndarray, shape (k, n_features)
        Centroids after the last M-step.
    memberships : numpy.ndarray, shape (n_samples, k)
        Assignments from the last E-step, i.e. computed against the centroids
        that were current *before* the final M-step.
    J_history : list of (iteration, step_name, J) tuples
        Two entries per iteration: one after the E-step, one after the M-step.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1 (no memberships would exist).
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Init
    centroids = init_centroids(k, data)

    # list to store the J's
    J_history = []
    previous_J = np.inf

    for iteration in range(max_iter):
        step = iteration + 1

        # E-step: update memberships for the current centroids
        memberships = e_step(data, centroids)

        # objective function after the E-step
        J_e = compute_objective(data, centroids, memberships)
        J_history.append((step, 'E-step', J_e))
        if verbose:
            print(f"Iteration {step} (E-step), Objective Function J: {J_e}")

        # M-step: update the centroids u_k. A cluster without members gives a
        # 0/0 mean; silence that warning and keep its previous centroid so NaN
        # values never reach the next E-step.
        with np.errstate(divide='ignore', invalid='ignore'):
            updated = m_step(data, memberships)
        lost = ~np.isfinite(updated).all(axis=1)
        if lost.any():
            updated[lost] = centroids[lost]
        centroids = updated

        # objective function after the M-step
        J_after = compute_objective(data, centroids, memberships)
        J_history.append((step, 'M-step', J_after))
        if verbose:
            print(f"Iteration {step} (M-step), Objective Function J: {J_after}")

        # Optional convergence check on the per-iteration improvement of J.
        if tol is not None and previous_J - J_after <= tol:
            break
        previous_J = J_after

    # results
    return centroids, memberships, J_history

In [None]:
# K-means run with 10 clusters and 10 iterations
k, max_iter = 10, 10
centroids_10, memberships_10, J_history_10 = kmeans(data=X, k=k, max_iter=max_iter)

Iteration 1 (E-step), Objective Function J: 4236695.799092658
Iteration 1 (M-step), Objective Function J: 2760512.7082585916
Iteration 2 (E-step), Objective Function J: 2541014.3716488187
Iteration 2 (M-step), Objective Function J: 2415515.9232988353
Iteration 3 (E-step), Objective Function J: 2348707.1128217475
Iteration 3 (M-step), Objective Function J: 2310055.2458272697
Iteration 4 (E-step), Objective Function J: 2289294.232477308
Iteration 4 (M-step), Objective Function J: 2277105.4547846485
Iteration 5 (E-step), Objective Function J: 2269817.2262748675
Iteration 5 (M-step), Objective Function J: 2264464.747894467
Iteration 6 (E-step), Objective Function J: 2259566.2636823566
Iteration 6 (M-step), Objective Function J: 2253369.488245678
Iteration 7 (E-step), Objective Function J: 2245926.2533060806
Iteration 7 (M-step), Objective Function J: 2237640.0463701626
Iteration 8 (E-step), Objective Function J: 2233847.103795133
Iteration 8 (M-step), Objective Function J: 2231743.60183571

In [None]:
# Collapse the membership rows of the K=10 run into one cluster id per sample
# (the column holding the largest membership value).
clusters = memberships_10.argmax(axis=1)
print(clusters.shape)

(70000,)


In [None]:
# K-means run with 20 clusters and 10 iterations
k, max_iter = 20, 10
centroids_20, memberships_20, J_history_20 = kmeans(data=X, k=k, max_iter=max_iter)

Iteration 1 (E-step), Objective Function J: 3757736.9317185725
Iteration 1 (M-step), Objective Function J: 2341121.148057056
Iteration 2 (E-step), Objective Function J: 2141357.0369235794
Iteration 2 (M-step), Objective Function J: 2036826.0256631474
Iteration 3 (E-step), Objective Function J: 1978064.4681071634
Iteration 3 (M-step), Objective Function J: 1940199.66100785
Iteration 4 (E-step), Objective Function J: 1920957.1048904585
Iteration 4 (M-step), Objective Function J: 1909343.518553321
Iteration 5 (E-step), Objective Function J: 1901142.6606707193
Iteration 5 (M-step), Objective Function J: 1894427.831253024
Iteration 6 (E-step), Objective Function J: 1890193.0565327064
Iteration 6 (M-step), Objective Function J: 1887091.7612746777
Iteration 7 (E-step), Objective Function J: 1885010.328657064
Iteration 7 (M-step), Objective Function J: 1883425.8229371484
Iteration 8 (E-step), Objective Function J: 1882186.7975490654
Iteration 8 (M-step), Objective Function J: 1881129.503531172

In [None]:
# K-means run with 5 clusters and 10 iterations
k, max_iter = 5, 10
centroids_5, memberships_5, J_history_5 = kmeans(data=X, k=k, max_iter=max_iter)

Iteration 1 (E-step), Objective Function J: 5162296.909219533
Iteration 1 (M-step), Objective Function J: 3081998.2514617834
Iteration 2 (E-step), Objective Function J: 2888995.3869262803
Iteration 2 (M-step), Objective Function J: 2842006.7224713936
Iteration 3 (E-step), Objective Function J: 2821854.8095341204
Iteration 3 (M-step), Objective Function J: 2809171.16700695
Iteration 4 (E-step), Objective Function J: 2800261.5059081027
Iteration 4 (M-step), Objective Function J: 2793461.846557532
Iteration 5 (E-step), Objective Function J: 2787735.1031088172
Iteration 5 (M-step), Objective Function J: 2782468.0904254788
Iteration 6 (E-step), Objective Function J: 2778056.775476995
Iteration 6 (M-step), Objective Function J: 2774273.276290742
Iteration 7 (E-step), Objective Function J: 2771113.7560496484
Iteration 7 (M-step), Objective Function J: 2768458.2259303713
Iteration 8 (E-step), Objective Function J: 2766409.9114700914
Iteration 8 (M-step), Objective Function J: 2764724.623932040

In [None]:
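# Function for purity
# Purity is the fraction of samples whose label equals the most frequent label
# of their cluster (1.0 means every cluster contains a single label).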
from collections import Counter

def purity(clusters, labels):
    """Purity of a clustering with respect to reference labels.

    For every cluster the size of its most frequent label is counted; purity
    is the sum of those counts divided by the total number of samples.

    Parameters
    ----------
    clusters : array-like, shape (n_samples,)
        Cluster id of each sample. Ids may be arbitrary values; they do not
        have to be the contiguous range ``0..n_clusters-1``.
    labels : array-like, shape (n_samples,)
        Reference label of each sample.

    Returns
    -------
    float
        Purity in ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``clusters`` and ``labels`` have different lengths.
    """
    clusters = np.asarray(clusters)
    labels = np.asarray(labels)
    if clusters.shape[0] != labels.shape[0]:
        raise ValueError("clusters and labels must have the same length")

    correct_labels = 0
    # Iterate over the ids that actually occur: looping over range(n_clusters)
    # would hit empty clusters and miss real ones whenever an id is skipped.
    for cluster_id in np.unique(clusters):
        cluster_labels = labels[clusters == cluster_id]
        _, label_counts = np.unique(cluster_labels, return_counts=True)
        correct_labels += int(label_counts.max())  # size of the majority label

    return correct_labels / labels.shape[0]

In [None]:
# Purity of the cluster assignment in `clusters` against the labels y.
fashion_purity = purity(clusters=clusters, labels=y)
print(f"Purity: {fashion_purity}")

Purity: 0.5464142857142857


In [None]:
# Function for the Gini index

def gini_index(clusters, Y):
    """Size-weighted average Gini impurity of the labels inside each cluster.

    For a cluster with label proportions ``p_1..p_m`` the Gini impurity is
    ``1 - sum(p_i ** 2)``. The global value weights every cluster by its
    number of samples and divides by the total sample count.

    Parameters
    ----------
    clusters : array-like, shape (n_samples,)
        Cluster id of each sample. Ids may be arbitrary values; they do not
        have to be the contiguous range ``0..n_clusters-1``.
    Y : array-like, shape (n_samples,)
        Reference label of each sample.

    Returns
    -------
    float
        0 when every cluster holds a single label; larger means more mixed.

    Raises
    ------
    ValueError
        If ``clusters`` and ``Y`` have different lengths.
    """
    clusters = np.asarray(clusters)
    Y = np.asarray(Y)
    if clusters.shape[0] != Y.shape[0]:
        raise ValueError("clusters and Y must have the same length")

    n_samples = Y.shape[0]  # Total number of samples
    total_gini = 0.0        # Running sum of size-weighted Gini values

    # Iterate over the ids that actually occur: looping over range(n_clusters)
    # silently drops every cluster whose id is >= the number of distinct ids.
    for cluster_id in np.unique(clusters):
        cluster_labels = Y[clusters == cluster_id]

        # Proportion of each label inside the cluster
        _, label_counts = np.unique(cluster_labels, return_counts=True)
        total_points = label_counts.sum()
        proportions = label_counts / total_points

        # Gini impurity of this cluster, weighted by the cluster size
        gini = 1.0 - np.sum(proportions ** 2)
        total_gini += total_points * gini

    # Global Gini index
    return float(total_gini / n_samples)


In [None]:
# Size-weighted Gini index of the cluster assignment in `clusters` against y.
gini_fashion = gini_index(clusters=clusters, Y=y)
print(f"Gini: {gini_fashion}")

Gini: 0.5716751925509309
