# Import Libraries

In [1]:
import numpy as np
import cv2
from keras.datasets import mnist
from sklearn.metrics import pairwise_distances_argmin_min
from google.colab.patches import cv2_imshow
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score

# Load Data

In [2]:
# Load MNIST dataset
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Concatenate train_images with test_images
images = np.concatenate((train_images, test_images), axis=0)
# images=images/255
# Concatenate train_labels with test_labels
labels = np.concatenate((train_labels, test_labels), axis=0)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# Create train-test split
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

In [4]:
unique_values, counts = np.unique(labels,return_counts=True)
unique_values,counts

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([6903, 7877, 6990, 7141, 6824, 6313, 6876, 7293, 6825, 6958]))

#Centroid Function

In [5]:
def extract_features(image, rows, cols):
    features = []
    grid_height = image.shape[0] // rows
    grid_width = image.shape[1] // cols

    for i in range(rows):
        for j in range(cols):
            # Calculate coordinates for the current grid cell
            x1 = i * grid_height
            x2 = x1 + grid_height
            y1 = j * grid_width
            y2 = y1 + grid_width

            # Extract the sub-image (grid cell) from the original image
            grid_cell = image[x1:x2, y1:y2]

            # Calculate the centroid coordinates for the grid cell
            sum_f = np.sum(grid_cell)
            rowss, columns = grid_cell.shape
            x_bar = np.sum(np.array([[x] * rowss for x in range(rowss)]) * grid_cell) / sum_f if sum_f != 0 else 0
            y_bar = np.sum(np.array([[y] * columns for y in range(columns)]) * grid_cell) / sum_f if sum_f != 0 else 0

            # Store the centroid coordinates as features for this grid cell
            features.append(x_bar)
            features.append(y_bar)
    return features

#Extract chain code from contour using 8-connectivity


In [6]:
def extract_chain_code(contour):
    chain_code = []
    directions = [(0, 1), (-1, 1), (-1, 0), (-1, -1), (0, -1), (1, -1), (1, 0), (1, 1)]  # 8 Connectivity

    for i in range(len(contour) - 1):
        dx = contour[i+1][0][0] - contour[i][0][0]
        dy = contour[i+1][0][1] - contour[i][0][1]

        # Normalize dx and dy to be in range [-1, 1]
        dx_normalized = np.sign(dx)
        dy_normalized = np.sign(dy)

        direction = directions.index((dx_normalized, dy_normalized))
        chain_code.append(direction)

    return chain_code

#Calculate chain code

In [7]:
def calculate_chain_code_dimensions(image):
    # Threshold the image
    _, thresholded_image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)

    # Find contours
    contours, _ = cv2.findContours(thresholded_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Find the largest contour
    largest_contour = max(contours, key=cv2.contourArea)  # Processing the most significant part of the image

    # Get chain code sequence for the largest contour
    chain_code_sequence = extract_chain_code(largest_contour)

    return chain_code_sequence

In [8]:
# Calculate chain code dimensions for all images
chain_code_sequences = []
for image in X_train:
    sequence = calculate_chain_code_dimensions(image)
    chain_code_sequences.append(sequence)

In [9]:
# Pad chain codes with zeros to make them uniform
max_length = max(len(code) for code in chain_code_sequences)
chain_codes_padded = [code + [8] * (max_length - len(code)) for code in chain_code_sequences]

# kmeans

In [13]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def assign_labels(X, centroids):
    labels = []
    for point in X:
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        label = np.argmin(distances)
        labels.append(label)
    return np.array(labels)

def update_centroids(X, labels, k):
    centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        cluster_points = X[labels == i]
        if len(cluster_points) > 0:
            centroids[i] = np.mean(cluster_points, axis=0)
    return centroids

def kmeans(X, k, max_iters=100):
    centroids = X[np.random.choice(range(X.shape[0]), size=k, replace=False)]
    for _ in range(max_iters):
        labels = assign_labels(X, centroids)
        new_centroids = update_centroids(X, labels, k)
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
    return centroids, labels


In [14]:
def count_samples_in_clusters(labels, k):
    counts = np.zeros(k, dtype=int)
    for label in labels:
        counts[label] += 1
    return counts

# Extract features

In [15]:
rows = 3
cols = 3
# Step 4: Extract features from images using centroid and chain code
centroid_features = np.array([extract_features(image, rows, cols) for image in X_train])
# chain_code_features = np.array([calculate_chain_code_dimensions(image) for image in images])
chain_code_features = np.array(chain_codes_padded)

In [16]:
centroid_features.shape

(56000, 18)

In [17]:
chain_code_features.shape

(56000, 73)

In [18]:
k = 10    #initialize number of clusters   Note:it's 10 clusters as we have 10 classses
centroids_centroid, labels_centroid = kmeans(centroid_features, k)
centroids_chain, labels_chain = kmeans(chain_code_features, k)

In [19]:
counts_centroid = count_samples_in_clusters(labels_centroid, k)
counts_chain = count_samples_in_clusters(labels_chain, k)

In [20]:
centroids_centroid.shape,labels_centroid.shape   # 10*18    70000*1

((10, 18), (56000,))

In [21]:
centroids_chain.shape,labels_chain.shape     # 10*73      70000*1

((10, 73), (56000,))

# Compare with actual distribution


In [22]:
# Compare with actual distribution
actual_counts = np.bincount(y_train)
print("Actual counts per class:", actual_counts)
print("Counts using centroid features:", counts_centroid)
print("Counts using chain code features:", counts_chain)

Actual counts per class: [5560 6277 5610 5708 5529 5040 5480 5790 5468 5538]
Counts using centroid features: [3746 9129 6619 2769 5418 5256 4087 7503 7588 3885]
Counts using chain code features: [5404 4458 5032 7327 4415 5354 6216 6057 4059 7678]


In [23]:
np.sum(actual_counts),np.sum(counts_chain),np.sum(counts_centroid)

(56000, 56000, 56000)

Adjusted Rand Index (ARI)


In [24]:
#The Adjusted Rand Index (ARI)

# Calculate ARI for centroid features
ari_centroid = adjusted_rand_score(y_train, labels_centroid)

# Calculate ARI for chain code features
ari_chain = adjusted_rand_score(y_train, labels_chain)

print("ARI for centroid features:", ari_centroid)
print("ARI for chain code features:", ari_chain)

ARI for centroid features: 0.1432602275345006
ARI for chain code features: 0.11069838606403704


In [25]:
from scipy.stats import mode

def map_cluster_labels(labels, true_labels):
    mapped_labels = np.zeros_like(labels)
    for i in range(k):
        mask = (labels == i)
        mapped_labels[mask] = mode(true_labels[mask])[0]
    return mapped_labels

def calculate_accuracy(predicted_labels, true_labels):
    correct = np.sum(predicted_labels == true_labels)
    total = len(true_labels)
    accuracy = correct / total
    return accuracy

# Map cluster labels to true labels
mapped_labels_centroid = map_cluster_labels(labels_centroid, y_train)
mapped_labels_chain = map_cluster_labels(labels_chain, y_train)

# Calculate accuracy
accuracy_centroid = calculate_accuracy(mapped_labels_centroid, y_train)
accuracy_chain = calculate_accuracy(mapped_labels_chain, y_train)

print("Accuracy using centroid features:", accuracy_centroid)
print("Accuracy using chain code features:", accuracy_chain)


Accuracy using centroid features: 0.3575357142857143
Accuracy using chain code features: 0.319875
