## <font color=red> You should not import any new libraries. Your code should run with python=3.x</font>

- Your solutions will be auto-graded, so please follow the instructions exactly.
- Modify the code only between

```
## TODO
## END TODO
```

- In addition to the above changes, you can play with the arguments of the functions used for generating plots
- We will run the auto grading scripts with private test data


In [None]:
import numpy as np
from matplotlib import pyplot as plt
import math


# KMeans Algorithm


In [None]:
class Kmeans:
    """
    K-means clustering (Lloyd's algorithm) on squared Euclidean distance.

    After `fit`, the cluster centers are available as `self.centroids`, a float
    array of shape (n_clusters, n_features).

    You may choose to define additional helper functions within this class,
    however don't change the prototype of the `fit` and `predict` methods.
    """

    # Upper bound on assignment/update rounds performed by `fit`.
    _MAX_ITER = 100

    def __init__(self, n_clusters, random_state):
        """
        Store the clustering configuration.

        Args:
        n_clusters   : int - no. of clusters
        random_state : int - changing this should change initial cluster centers
        """

        self.n_clusters = n_clusters
        self.random_state = random_state


    def fit(self, X):
        """
        Run the KMeans algorithm and store the cluster centers on the instance.

        Initial centers are `n_clusters` distinct samples of X picked with the
        global NumPy RNG seeded by `random_state`. Iteration stops when the
        labels no longer change or after `_MAX_ITER` rounds.

        Args:
        X : numpy array of shape (n_samples, n_features)

        Returns:
        Nothing
        """

        ## TODO
        X = np.asarray(X)
        n_samples = X.shape[0]
        np.random.seed(self.random_state)
        initial = np.random.choice(n_samples, self.n_clusters, replace=False)

        # Centroids must be floating point: with integer X an integer centroid
        # array would silently truncate every mean written into it.
        dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else float
        self.centroids = np.array(X[initial], dtype=dtype)

        # No previous labelling yet. Starting from None (instead of an
        # all-zeros vector) guarantees at least one centroid update even when
        # the first assignment happens to be all zeros (e.g. n_clusters == 1).
        Y = None

        for _ in range(self._MAX_ITER):
            new_Y = self.predict(X)

            if Y is not None and np.array_equal(new_Y, Y):
                break

            Y = new_Y
            # Only cluster indices 0..n_clusters-1 can ever appear in Y.
            for k in range(self.n_clusters):
                members = X[Y == k]
                # An empty cluster keeps its previous centroid.
                if len(members):
                    self.centroids[k] = members.mean(axis=0)

        ## END TODO


    def predict(self, X):
        """
        Uses the stored cluster centers from fit to predict labels.

        Args:
        X : numpy array of shape (n_samples, n_features)

        Returns:
        Y : numpy array of shape (n_samples, ) - labels for each of the data point in X
        """

        ## TODO
        X = np.asarray(X)
        Y = np.argmin(self.distance(X), axis=1)

        ## END TODO

        assert Y.shape == (X.shape[0],), "Return Y in expected format"
        return Y


    def distance(self, X):
        """
        Squared Euclidean distance from every sample to every centroid.

        Args:
        X : numpy array of shape (n_samples, n_features)

        Returns:
        numpy array of shape (n_samples, n_clusters)
        """
        # Broadcasting (n_samples, 1, d) against (1, n_clusters, d) avoids
        # hard-coding the number of features.
        diff = np.asarray(X)[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        return np.sum(diff**2, axis=2)


## LOAD DATA 1


In [None]:
# Dataset 1: sample coordinates from train_X_1.npy; the array in train_Y_1.npy
# is wrapped, transposed and squeezed before being used as point colours.
X = np.load("./data/train_X_1.npy")
Y = np.squeeze(np.array([np.load("./data/train_Y_1.npy")]).T)

# First figure: points coloured by the loaded Y values.
plt.scatter(X[:, 0], X[:, 1], c=Y)
# Open a fresh figure so the next scatter is drawn separately.
plt.figure()

# Second figure: points coloured by the labels of a 4-cluster KMeans fit.
km = Kmeans(n_clusters=4, random_state=123)
km.fit(X)
y_pred = km.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)


## LOAD DATA 2


In [None]:
# Dataset 2: sample coordinates from train_X_2.npy; the array in train_Y_2.npy
# is wrapped, transposed and squeezed before being used as point colours.
X = np.load("./data/train_X_2.npy")
Y = np.squeeze(np.array([np.load("./data/train_Y_2.npy")]).T)

# First figure: points coloured by the loaded Y values.
plt.scatter(X[:, 0], X[:, 1], c=Y)
# Open a fresh figure so the next scatter is drawn separately.
plt.figure()

# Second figure: points coloured by the labels of a 2-cluster KMeans fit.
km = Kmeans(n_clusters=2, random_state=123)
km.fit(X)
y_pred = km.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)


## LOAD DATA 3


In [None]:
# Dataset 3: sample coordinates from train_X_3.npy; the array in train_Y_3.npy
# is wrapped, transposed and squeezed before being used as point colours.
X = np.load("./data/train_X_3.npy")
Y = np.squeeze(np.array([np.load("./data/train_Y_3.npy")]).T)

# First figure: points coloured by the loaded Y values.
plt.scatter(X[:, 0], X[:, 1], c=Y)
# Open a fresh figure so the next scatter is drawn separately.
plt.figure()

# Second figure: points coloured by the labels of a 2-cluster KMeans fit.
km = Kmeans(n_clusters=2, random_state=123)
km.fit(X)
y_pred = km.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)


# Simple Kernel Design


In [None]:
class Kmeans_Kernel:
    """
    K-means on a radial feature map for concentric-circle data.

    Each sample is mapped to phi(x) = ||x - center||, its distance from the
    per-feature mean of the training data, and 1-D k-means is run on phi.
    After `fit`, `self.center` holds the training mean and
    `self.phi_centroids` the cluster radii (shape (n_clusters,)).

    You may choose to define additional helper functions within this class,
    however don't change the prototype of the `fit` and `predict` methods.
    """

    # Upper bound on assignment/update rounds performed by `fit`.
    _MAX_ITER = 100

    def __init__(self, n_clusters, random_state):
        """
        Store the clustering configuration.

        Args:
        n_clusters: int - no. of clusters
        random_state: int - changing this should change initial cluster centers
        """

        self.n_clusters = n_clusters
        self.random_state = random_state


    def make_zero_centered(self, X):
        """
        Returns a zero-centered concentric circles Dataset

        The per-feature mean of X is stored in `self.center` so that `predict`
        can apply the same shift later.

        Args:
        X : numpy array of shape (n_samples, n_features)

        Returns:
        X : numpy array of shape (n_samples, n_features)
        """

        ## TODO
        X = np.asarray(X, dtype=float)
        # Mean per column, not over all entries: a scalar mean only centers
        # the data when every coordinate happens to share the same offset.
        self.center = X.mean(axis=0)
        X = X - self.center

        ## END TODO

        return X


    def fit(self, X):
        """
        Run KMeans on the radial feature and store the cluster radii on the instance.

        Initial radii are those of `n_clusters` distinct samples picked with
        the global NumPy RNG seeded by `random_state`. Iteration stops when
        the labels no longer change or after `_MAX_ITER` rounds.

        Args:
        X : numpy array of shape (n_samples, n_features)

        Returns:
        Nothing
        """

        ## TODO
        X = self.make_zero_centered(X)
        n_samples = X.shape[0]
        np.random.seed(self.random_state)
        phi = np.sqrt(np.sum(X**2, axis=1))
        initial = np.random.choice(n_samples, self.n_clusters, replace=False)
        self.phi_centroids = np.array(phi[initial])

        # No previous labelling yet. Starting from None (instead of an
        # all-zeros vector) guarantees at least one centroid update even when
        # the first assignment happens to be all zeros.
        Y = None

        for _ in range(self._MAX_ITER):
            # X is already centered here, so label it through `distance`
            # directly; going through `predict` would subtract the center a
            # second time.
            new_Y = np.argmin(self.distance(X), axis=1)

            if Y is not None and np.array_equal(new_Y, Y):
                break

            Y = new_Y
            # Only cluster indices 0..n_clusters-1 can ever appear in Y.
            for k in range(self.n_clusters):
                members = phi[Y == k]
                # An empty cluster keeps its previous radius.
                if len(members):
                    self.phi_centroids[k] = members.mean()

        ## END TODO


    def predict(self, X):
        """
        Uses the stored cluster centers from fit to predict labels using the kernel.

        Args:
        X : numpy array of shape (n_samples, n_features), in the original
            (un-centered) coordinates

        Returns:
        Y : Labels for each of the data point in X.
        """

        ## TODO
        X = np.asarray(X) - self.center
        Y = np.argmin(self.distance(X), axis=1)

        ## END TODO

        assert Y.shape == (X.shape[0],), "Return Y in expected format"
        return Y


    def distance(self, X):
        """
        Squared difference between each sample's radius and each cluster radius.

        Args:
        X : numpy array of shape (n_samples, n_features), already centered

        Returns:
        numpy array of shape (n_samples, n_clusters)
        """
        phi_X = np.sqrt(np.sum(np.asarray(X)**2, axis=1))
        # (a - b)**2 equals a**2 + b**2 - 2ab but cannot go slightly negative
        # from rounding.
        return (phi_X[:, np.newaxis] - self.phi_centroids[np.newaxis, :])**2


# Dataset 3 again, with every coordinate shifted by 0.5 before clustering.
X = np.load("./data/train_X_3.npy") + 0.5
# The array in train_Y_3.npy is wrapped, transposed and squeezed before being
# used as point colours.
Y = np.squeeze(np.array([np.load("./data/train_Y_3.npy")]).T)

# First figure: shifted points coloured by the loaded Y values.
plt.scatter(X[:, 0], X[:, 1], c=Y)
# Open a fresh figure so the next scatter is drawn separately.
plt.figure()

# Second figure: points coloured by the labels of a 2-cluster kernel KMeans fit.
km = Kmeans_Kernel(n_clusters=2, random_state=123)
km.fit(X)
y_pred = km.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
