Import libraries

Asaf Benor

In [5]:
import scipy.io
import scipy.special
import numpy as np
import matplotlib.pyplot as plt

# Importing MNIST data
# The relative path is resolved against the current working directory.
# scipy.io.loadmat returns a dict keyed by MATLAB variable name; the code
# further down reads the entries 'test0' ... 'test9' from it.
mnist_dataset = scipy.io.loadmat('mnist_all.mat')  # Loading the dataset from a .mat file

In [None]:
# ---- K-means configuration ----
clusters = 10        # number of centroids to fit
num_iterations = 45  # iteration budget; also sizes the cost history below

# One slot per recorded cost value.
# NOTE(review): the history has num_iterations - 2 slots — confirm this
# matches the number of passes the clustering loop is meant to run.
error_tracking = np.zeros(num_iterations - 2)

# ---- Data preparation ----
# Stack the per-digit test matrices ('test0' ... 'test9') on top of each other.
digit_keys = [f'test{digit}' for digit in range(10)]
combined_test_data = np.vstack([mnist_dataset[key] for key in digit_keys])
data_shape = combined_test_data.shape

# Total count of scalar entries (rows * columns) in the stacked matrix.
total_elements = data_shape[0] * data_shape[1]

# Reshape to one row per scalar entry.
# NOTE(review): for a 2-D stack this gives shape (rows * cols, 1), i.e. every
# pixel value becomes its own 1-D sample. If the intent was one sample per
# image, the matrix should keep its (rows, cols) layout — confirm.
flattened_data = np.reshape(combined_test_data, (total_elements, -1))

# ---- Centroid initialisation ----
# Draw one random integer in [0, 256) per cluster and use it as the value of
# every coordinate of that cluster's starting centre.
initial_indices = np.random.randint(0, 256, size=clusters)
cluster_centers = np.zeros((clusters, flattened_data.shape[1]))

for idx, start_value in enumerate(initial_indices):
    cluster_centers[idx, :] = start_value

# K-means Clustering Process
# Each pass (1) assigns every sample to its nearest centre, (2) records the
# total squared distance under that assignment, and (3) moves every non-empty
# centre to the mean of its members. The per-sample work is expressed as NumPy
# array operations instead of Python loops over every (sample, cluster) pair,
# which would need total_elements * clusters interpreter-level steps per pass.
# NOTE(review): range(2, num_iterations - 2) performs num_iterations - 4
# passes and numbers them from 2 — confirm that this offset is intended.
sample_rows = np.arange(total_elements)

for iteration in range(2, num_iterations - 2):
    # Squared Euclidean distance from every sample to every centre, filled in
    # one column per cluster. cluster_centers is a float array, so the
    # difference is computed in floating point.
    distances = np.zeros((total_elements, clusters))
    for cluster_idx in range(clusters):
        offsets = flattened_data - cluster_centers[cluster_idx, :]
        distances[:, cluster_idx] = np.sum(offsets ** 2, axis=1)

    # Hard assignment as a one-hot row per sample. np.argmin returns the first
    # minimum on ties, which matches a per-row argmin.
    nearest_cluster = np.argmin(distances, axis=1)
    responsibilities = np.zeros((total_elements, clusters))
    responsibilities[sample_rows, nearest_cluster] = 1

    # Members per cluster, cost of this assignment (distance of each sample
    # to its chosen centre), and per-cluster coordinate sums.
    sum_responsibilities = np.sum(responsibilities, axis=0)
    total_cost = np.sum(distances[sample_rows, nearest_cluster])
    cluster_sum = responsibilities.T @ flattened_data

    # Move only the centres that received at least one sample; an empty
    # cluster keeps its previous position (and avoids a division by zero).
    occupied = sum_responsibilities > 0
    cluster_centers[occupied, :] = (
        cluster_sum[occupied, :] / sum_responsibilities[occupied][:, np.newaxis]
    )

    error_tracking[iteration - 2] = total_cost

    # Check for convergence: stop once the cost changed by less than 5
    # compared with the previous pass.
    if iteration > 2 and abs(error_tracking[iteration - 2] - error_tracking[iteration - 3]) < 5:
        print(f"Converged at iteration {iteration}")
        break

    print(f"Cost at iteration {iteration} is {total_cost}")


# Visualizing the cost reduction
# error_tracking is allocated with np.zeros, so any slot the clustering loop
# never wrote (unused tail slots, or everything after an early break) is still
# 0.0. Drop that trailing run of zeros so only recorded costs are plotted.
# NOTE(review): a genuinely zero cost at the end of the run would be trimmed
# as well.
recorded_costs = np.trim_zeros(error_tracking, trim='b')

# Build the x axis from the data actually plotted so x and y always have the
# same length; plt.plot raises ValueError when they differ.
iteration_axis = np.arange(1, recorded_costs.size + 1)

plt.plot(iteration_axis, recorded_costs)
plt.title("cost as a function of iterations")
plt.xlabel("Iteration Number")
plt.ylabel("Total Error")
# Keep the right limit above the left one even with zero or one recorded cost.
plt.xlim([1, max(iteration_axis.size, 2)])
plt.show()