# **DMML ASSIGNMENT 3**
## **TASK 1. Fashion MNIST dataset**

Name: Alena Maria Thomas, Ananya Kaushal

Roll No: MDS202303, MDS202306

### 1. Import required packages.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.datasets import fashion_mnist
from keras.layers import Flatten, Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

### 2. Load the dataset.

In [2]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [3]:
print("original shape of X_train:", X_train.shape)
print("original shape of X_test:", X_test.shape)

original shape of X_train: (60000, 28, 28)
original shape of X_test: (10000, 28, 28)


### 3. Preprocess the dataset.

The training set contains 60,000 grayscale images, each 28x28 pixels.

Each pixel intensity is represented as a byte (0 to 255):

In [4]:
X_train.dtype

dtype('uint8')

Therefore, normalize the data i.e. scaling the pixel intensities down to $[0, 1]$ and convert to floats, by dividing by 255.

In [5]:
X_train = X_train / 255.
X_test = X_test / 255.

### 4. Define function to perform K-Means Clustering to obtain a small representative labeled subset.

In [6]:
def KMeans_seed_set(X, y, k):
  # Flatten the images for clustering
  x_train = X.reshape(X.shape[0], -1)

  # Fit KMeans model
  kmeans = KMeans(n_clusters=k, random_state=100)
  kmeans.fit(x_train)

  # Obtain the centroids and clusters
  centroids = kmeans.cluster_centers_
  cluster_labels = kmeans.labels_

  # initialize a numpy arrays to store the seed set.
  seed_set_X = np.zeros((k, X.shape[1], X.shape[1]))
  seed_set_y =  np.zeros(k)

  # for each cluster, find the nearest image to the centroid
  for i in range(k):
    # Get all features in this cluster i
    cluster_features = x_train[cluster_labels == i]

    # Compute the Euclidean distance to the centroid
    distances = euclidean_distances(cluster_features, [centroids[i]])

    # Find index of nearest image
    nearest_idx = np.argmin(distances)

    # Find the corresponding data point and label
    datapoint = X[cluster_labels == i][nearest_idx]
    idx = np.where((X == datapoint).all(axis=1))[0]
    label = y[idx][0]

    # Add this datapoint to the seed set.
    seed_set_X[i] = datapoint
    seed_set_y[i] = label

  return (seed_set_X, seed_set_y)

### 5. Define functions to build and train the MLP model on the representative set.

In [7]:
#create a simple MLP model
def create_mlp_model():
  tf.keras.backend.clear_session() # to reset the name counters
  tf.random.set_seed(100)

  model = Sequential([
      Flatten(input_shape=[28,28]),
      Dense(512, activation='relu'),
      Dropout(0.2),
      Dense(100, activation='relu'),
      Dropout(0.2),
      Dense(10, activation='softmax')
  ])
  model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
  )
  return model

In [8]:
def train_mlp_model(model, X, y):
  # one-hot encode the target variable
  new_y = to_categorical(y)

  # Set up early stopping to prevent overfitting
  early_stop = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=5)

  # train the MLP model
  history = model.fit(X, new_y, batch_size=128, epochs=25, verbose=1,
                      callbacks=[early_stop], shuffle=True)

  return history

### 6. Compare the accuracy of the trained MLP model for different values of $K$.

In [9]:
k_values = [10, 20, 30, 40, 50]

In [10]:
training_accuracy = {}
models = {}

for k in k_values:
  # obtain seed set after applying KMeans clustering
  seed_X, seed_y = KMeans_seed_set(X_train, y_train, k)

  # Build an MLP model
  model = create_mlp_model()

  # Train an MLP model using this set
  history = train_mlp_model(model, X_train, y_train)

  training_accuracy[k] = round(history.history['accuracy'][-1], 4)

  models[k] = model



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [11]:
train_accuracy_df = pd.DataFrame()
train_accuracy_df['K'] = training_accuracy.keys()
train_accuracy_df['Training Accuracy'] = training_accuracy.values()
train_accuracy_df

Unnamed: 0,K,Training Accuracy
0,10,0.9234
1,20,0.9231
2,30,0.9228
3,40,0.924
4,50,0.9237


### 7. Model Evaluation

In [14]:
# One hot encode the test target variable
new_y_test = to_categorical(y_test)

In [15]:
# testing accuracy
testing_accuracy = {}

for k in k_values:
  accuracy = models[k].evaluate(X_test, new_y_test, batch_size=128)[1]
  testing_accuracy[k] = round(accuracy, 4)



In [16]:
test_accuracy_df = pd.DataFrame()
test_accuracy_df['K'] = testing_accuracy.keys()
test_accuracy_df['Testing Accuracy'] = testing_accuracy.values()
test_accuracy_df

Unnamed: 0,K,Testing Accuracy
0,10,0.8962
1,20,0.8949
2,30,0.8954
3,40,0.8941
4,50,0.8939
