# **DMML ASSIGNMENT 3**
## **TASK 2. Overhead MNIST dataset**

Name: Alena Maria Thomas, Ananya Kaushal

Roll No: MDS202303, MDS202306

### 1. Import required packages.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import accuracy_score

### 2. Load the dataset.

In [2]:
# Directory that holds the Overhead-MNIST CSV files. '/content' is the location
# this notebook has always read from; change it here when running elsewhere.
DATA_DIR = '/content'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [3]:
# Preview the first rows of the training frame to check the column layout.
train_df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,8,86,91,98,97,96,89,76,84,87,...,76,81,80,72,77,83,71,44,53,33
1,3,128,133,129,119,114,128,141,131,133,...,137,129,139,161,128,137,131,125,127,135
2,0,138,117,84,52,38,62,86,75,101,...,63,99,109,83,61,54,75,98,107,108
3,6,105,108,108,109,111,112,107,104,106,...,120,115,105,89,87,98,92,95,89,84
4,5,176,143,109,155,151,149,145,152,156,...,92,85,82,91,80,80,79,79,82,85


Separate the features and the labels.

In [4]:
# Features are every column after the first one; the target is read by name
# from the 'label' column (copied so it is independent of the source frame).
X_train, y_train = train_df[train_df.columns[1:]], train_df['label'].copy()
X_test, y_test = test_df[test_df.columns[1:]], test_df['label'].copy()

In [5]:
# Report the (rows, columns) shape of each feature matrix as a sanity check
# on the feature/label split.
print("original shape of X_train:", X_train.shape)
print("original shape of X_test:", X_test.shape)

original shape of X_train: (8519, 784)
original shape of X_test: (1065, 784)


### 3. Preprocess the dataset.

Normalize the data, i.e. scale the pixel intensities down to $[0, 1]$ (converting them to floats) by dividing by 255.

In [6]:
# Rescale both feature matrices by the same constant (255).
# Re-running this cell divides again, so run it exactly once per kernel session.
X_train, X_test = X_train / 255., X_test / 255.

### 4. Define a function that performs K-Means clustering to obtain a small, representative labeled subset.

In [7]:
def KMeans_seed_set(X, y, k):
  """Pick one representative labeled sample per K-Means cluster.

  Fits KMeans with ``k`` clusters (``random_state=100``) on ``X`` and, for
  every cluster, selects the member of ``X`` that lies closest (Euclidean
  distance) to that cluster's centroid, together with its label from ``y``.

  Parameters
  ----------
  X : 2-D array-like or DataFrame, shape (n_samples, n_features)
      Feature matrix to cluster.
  y : 1-D array-like or Series, length n_samples
      Labels aligned with the rows of ``X`` by position. They are stored in
      an int64 array, so they must be integer-valued.
  k : int
      Number of clusters, and therefore the size of the seed set.

  Returns
  -------
  (seed_set_X, seed_set_y) : tuple of numpy.ndarray
      ``seed_set_X`` has shape (k, n_features); ``seed_set_y`` has shape (k,)
      and dtype int64. Row ``i`` corresponds to cluster ``i``.
  """
  # Fit KMeans model
  kmeans = KMeans(n_clusters=k, random_state=100)
  kmeans.fit(X)

  # Obtain the centroids and clusters
  centroids = kmeans.cluster_centers_
  cluster_labels = kmeans.labels_

  # Work on plain arrays so rows are addressed purely by position; this keeps
  # the function independent of any pandas index carried by X or y.
  X_values = np.asarray(X)
  y_values = np.asarray(y)

  # initialize numpy arrays to store the seed set.
  seed_set_X = np.zeros((k, X_values.shape[1]))
  seed_set_y = np.zeros(k, dtype='int64')

  # for each cluster, find the nearest image to the centroid
  for i in range(k):
    # Positions (row numbers in X) of the samples assigned to cluster i
    member_positions = np.flatnonzero(cluster_labels == i)

    # Compute the Euclidean distance of every member to the centroid
    distances = euclidean_distances(X_values[member_positions], [centroids[i]])

    # Map the nearest member back to its row number in X. Using the position
    # directly (instead of searching X for an equal row) yields exactly one
    # label even when several samples have identical features.
    nearest_position = member_positions[np.argmin(distances)]

    # Add this datapoint and its label to the seed set.
    seed_set_X[i] = X_values[nearest_position]
    seed_set_y[i] = y_values[nearest_position]

  return (seed_set_X, seed_set_y)

### 5. Define functions to build and train the MLP model on the representative set.

In [8]:
# baseline OMNIST MLP model
def create_mlp_model():
  """Return a fresh, unfitted MLPClassifier (max_iter=200, random_state=100)."""
  return MLPClassifier(max_iter=200, random_state=100)

In [9]:
def train_mlp_model(model, X, y):
  """Fit ``model`` on (X, y) and return its accuracy on that same data.

  Parameters
  ----------
  model : estimator exposing ``fit(X, Y)`` and ``predict(X)``
      Fitted in place.
  X : array-like or DataFrame, shape (n_samples, n_features)
      Training features.
  y : 1-D array-like or Series, length n_samples
      Class labels; one-hot encoded with ``pd.get_dummies`` before fitting,
      so the encoded columns are exactly the labels present in ``y``.

  Returns
  -------
  float
      Training accuracy rounded to 4 decimals. Because the targets are an
      indicator matrix, a sample counts as correct only when its whole
      predicted row matches the encoded row.
  """
  # one-hot encode the target variable
  new_y = pd.get_dummies(y)

  # train the MLP model
  model.fit(X, new_y)

  # Predict on the data the model was trained on (the X argument, not a
  # notebook-level variable), so predictions line up row-for-row with new_y.
  y_pred = model.predict(X)

  # Calculate training accuracy
  accuracy = round(accuracy_score(new_y, y_pred), 4)

  return accuracy

### 6. Compare the accuracy of the trained MLP model for different values of $K$.

In [10]:
# Seed-set sizes (number of K-Means clusters) to compare: 10, 20, 30, 40, 50.
k_values = list(range(10, 51, 10))

In [11]:
# Per-K results: training accuracy and the fitted model, keyed by K.
training_accuracy = {}
models = {}

for k in k_values:
  # obtain seed set after applying KMeans clustering
  seed_X, seed_y = KMeans_seed_set(X_train, y_train, k)

  # Build an MLP model
  model = create_mlp_model()

  # Train the MLP model.
  # NOTE(review): the model is fit on the full X_train / y_train, not on
  # seed_X / seed_y, so every K trains on identical data with the same
  # random_state and the comparison across K is not yet meaningful.
  # Before passing seed_X / seed_y here, verify that train_mlp_model scores
  # on its own X argument and that the one-hot label columns of a small seed
  # set still cover every class expected by the test-set evaluation.
  acc = train_mlp_model(model, X_train, y_train)

  training_accuracy[k] = acc
  models[k] = model

In [12]:
# Tabulate training accuracy per K (one row per entry of training_accuracy).
train_accuracy_df = pd.DataFrame({
  'K': list(training_accuracy.keys()),
  'Training Accuracy': list(training_accuracy.values()),
})
train_accuracy_df

Unnamed: 0,K,Training Accuracy
0,10,0.7264
1,20,0.7264
2,30,0.7264
3,40,0.7264
4,50,0.7264


### 7. Model Evaluation

In [13]:
# One-hot encode the test target variable with pd.get_dummies, the same
# encoding train_mlp_model applies to its targets, so that predictions can be
# compared against it as an indicator matrix.
new_y_test = pd.get_dummies(y_test)

In [14]:
# Testing accuracy of each fitted model, keyed by K and rounded to 4 decimals.
testing_accuracy = dict.fromkeys(k_values)

for k in testing_accuracy:
  # Predict the held-out set with the model trained for this K
  y_pred = models[k].predict(X_test)
  testing_accuracy[k] = round(accuracy_score(new_y_test, y_pred), 4)

In [15]:
# Tabulate testing accuracy per K (one row per entry of testing_accuracy).
test_accuracy_df = pd.DataFrame({
  'K': list(testing_accuracy.keys()),
  'Testing Accuracy': list(testing_accuracy.values()),
})
test_accuracy_df

Unnamed: 0,K,Testing Accuracy
0,10,0.4329
1,20,0.4329
2,30,0.4329
3,40,0.4329
4,50,0.4329
