## Mixtures of Gaussians
### Assumptions about the Dustribution of the Latent Variable
### Jensen's Inequality
### Log-likelihood for the Incomplete and Complete Data
### Expectation Step
### Maximization Step
#### * With Respect to the Mean of the Distribution within Each Cluster
#### * With Respect to the Covariance Matrix of the Distribution within Each Cluster
#### * With Respect to the parameters of the Latent Variable
### Convergence Test
### Prediction


In [1]:
%matplotlib inline
import numpy as np 
import sklearn.preprocessing
import sklearn.datasets
import pandas as pd
import sklearn.model_selection
import numpy.random
import math
import sklearn.metrics
numpy.random.seed(42)

In [2]:
X, y = sklearn.datasets.load_iris(return_X_y=True)
#X, y = sklearn.datasets.load_wine(return_X_y=True)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)
standard = sklearn.preprocessing.StandardScaler()
X_train = standard.fit_transform(X_train)
training_data = np.c_[X_train, y_train]#All of the features are continuous, so, no need to use one-hot encoder and we can directly standard normalize the features of the data set

X_test = standard.transform(X_test)
test_data = np.c_[X_test, y_test]
print(training_data.shape)
print(test_data.shape)
k = len(set(y_train))
y_train#It needs to be labeled from 0 to k


(112, 5)
(38, 5)


array([0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 0, 1,
       2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 0, 1, 2, 0,
       0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0, 0, 0, 1,
       2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1,
       2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 0,
       1, 2])

In [3]:
#To have better initialization of the algorithm
class kmeans(object):

    def __init__(self, X_train, k):
        self.K = k
        self.m = X_train.shape[0]
        self.n = X_train.shape[1]
        self.X_train = X_train
        choices = numpy.random.choice(np.arange(0, self.m), self.K, replace=False)
        self.centers = [X_train[choices[i], :].reshape(-1, 1) for i in range(0, self.K)]# initalize the clusters centers to be one of the observations
        self.clusters_assignments = np.zeros((self.m, 1))#Just to give it the necessary shape
    
    def reassign_clusters_centers(self):
        for k in range(0, self.K):
            temp = np.zeros((self.n, 1))
            clusters = list(map(lambda i: True if i == k else False, self.clusters_assignments))
            for i in range(0, self.m):
                if clusters[i] == True:
                    temp += clusters[i] * self.X_train[i, :].reshape(-1, 1)#clusters contained in {0, 1}
                else:
                    pass
            #print(np.sum(clusters))
            self.centers[k] = temp/np.sum(clusters)

    def distortiuon_function(self):
        temp = 0
        for i in range(0, self.m):
            for k in range(0, self.K):
                if self.clusters_assignments[i] == k:
                    temp += np.linalg.norm(self.X_train[i, :].reshape(-1, 1) - self.centers[k].reshape(-1, 1))**2
                    break#They willn't be assigned to more than one cluster in tandem
        return temp

    def assign_to_clusters(self, x):
        temp = []
        for k in range(0, self.K):
            temp.append(np.linalg.norm(x.reshape(-1, 1) - self.centers[k].reshape(-1, 1))**2)#We will use L2-norm for dissimilarity measure
        return np.argmin(temp)#return the cluster number

    def E_step(self):
        for i in range(0, self.m):
            self.clusters_assignments[i] = self.assign_to_clusters(X_train[i, :].reshape(-1, 1))
    
    def fit(self, max_iterations, eps=1e-5):
        self.E_step()#To initialize the clusters assignments
        past = 10
        future = 0
        count = 0
        while(abs(past - future) > eps):#I will care for only lack of progress because k-means will always be able to minimize the distortion functions
            print(f"count:{count}, max_iterations{max_iterations}, past:{past}, future:{future}")
            count += 1
            past = self.distortiuon_function()
            self.reassign_clusters_centers()#The M step
            self.E_step()
            future = self.distortiuon_function()

        return self.centers, self.clusters_assignments

    def prediction_dataset(self, X):
        predictions = []
        for i in range(0, X.shape[0]):
            predictions.append(self.assign_to_clusters(X[i, :].reshape(-1, 1)))
        return predictions

    def predict(self, x):
        return  self.assign_to_clusters(x.reshape(-1, 1))



In [4]:
class MixtureofGaussian(object):

    def __init__(self, X_train, k, randomly="random", max_iteration =1000):
        self.m = X_train.shape[0]
        self.n = X_train.shape[1]
        self.K = k
        self.means_of_each_clusters = list(map(lambda i: np.zeros((self.n, 1)), np.arange(0, self.K)))
        self.covariance_within_each_clusters = list(map(lambda i: np.zeros((self.n, self.n)), np.arange(0, self.K))) 
        self.parameters_of_hard_latent = [i for i in np.zeros((self.K, 1))]
        self.parameters_of_soft_latent =np.zeros((self.m, self.K))#Posterior
        self.X_train = X_train
        if randomly == "random":
            self.initialize_parameters_randomly()
        else:#This is if we want to initilaize it with k-means
            model = kmeans(self.X_train, self.K)
            centers, clusters  = model.fit(max_iterations=max_iteration)
            print("Finished the initialzation by the kmeans")
            for k in range(0, self.K):
                cluster_separated = list(map(lambda i: True if i == k else False, clusters))
                self.parameters_of_hard_latent[k] = np.sum(cluster_separated)/self.m
                self.means_of_each_clusters[k] = (centers[k]).reshape(-1, 1)
                #Calaculating the covariance matrix matrix within each cluster
                temp = np.zeros((self.n, self.n))
                for i in range(0, self.m):
                    if cluster_separated[i] == True:
                        temp += np.dot((self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[k].reshape(-1, 1)).reshape(-1, 1), (self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[k].reshape(-1, 1)).reshape(1, -1))
                
                self.covariance_within_each_clusters[k] = (1/np.sum(clusters)) * temp
                #print(self.covariance_within_each_clusters[k])

        
    def initialize_parameters_randomly(self):
        for k in range(0, self.K):
            self.means_of_each_clusters[k] = (numpy.random.randn(self.n)).reshape(-1, 1)
            self.covariance_within_each_clusters[k] = 25 * np.eye(self.n)
            self.parameters_of_hard_latent[k] = abs(numpy.random.randn())

        sums = np.sum(self.parameters_of_hard_latent)
        self.parameters_of_hard_latent = self.parameters_of_hard_latent/sums#To ensure that the parameters of the Multinomial distribution sums to 1

    def gaussian_distribution(self, chosen_class, x):
        t1 = np.divide(1, np.linalg.det(2* np.pi * self.covariance_within_each_clusters[chosen_class]))
        t2 = np.dot((x - self.means_of_each_clusters[chosen_class]).T, np.dot(np.linalg.pinv(self.covariance_within_each_clusters[chosen_class]), (x - self.means_of_each_clusters[chosen_class])))
        return t1 * np.exp((-1/2) * t2)

    def compute_soft_latent(self, chosen_class, x):
        denominator = 0 
        for k in range(0, self.K):
            denominator = denominator + ( self.gaussian_distribution(k, x) * self.parameters_of_hard_latent[k])
        #1e-50 were added to preven division by zero when computing the log-likelihood
        return (self.gaussian_distribution(chosen_class, x) * self.parameters_of_hard_latent[chosen_class] + 1e-50)/denominator
    
    def E_step(self):
        temp = np.zeros((1, self.K))
        for i in range(0, self.m):
            for k in range(0, self.K):
                #print(self.compute_soft_latent(k, self.X_train[i, :].reshape(-1, 1)).shape)
                temp[0, k] = self.compute_soft_latent(k, self.X_train[i, :].reshape(-1, 1)) 
            self.parameters_of_soft_latent[i, :] = temp[0, :]
    
    def M_step(self):
        for k in range(0, self.K):
            means_temp = np.zeros((self.n, 1))
            covariance_temp = np.zeros((self.n, self.n))

            for i in range(0, self.m):
                means_temp += (self.parameters_of_soft_latent[i, k] * self.X_train[i, :]).reshape(-1, 1)
                covariance_temp += (self.parameters_of_soft_latent[i, k] * np.multiply( (self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[k]), (self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[k]).T ) )

            self.means_of_each_clusters[k] = means_temp/np.sum(self.parameters_of_soft_latent[:, k])
            self.covariance_within_each_clusters[k] = covariance_temp/np.sum(self.parameters_of_soft_latent[:, k])
            self.parameters_of_hard_latent[k] = np.sum(self.parameters_of_soft_latent[:, k])/self.m
    
    def computing_log_likelihood(self):
        log_likelihood = 0
        for i in range(0, self.m):
            for k in range(0, self.K):
                #print(self.parameters_of_soft_latent)
                assert(self.parameters_of_soft_latent[i, k] > 0 )
                t1 = self.gaussian_distribution(k, self.X_train[i, :].reshape(-1, 1)) * self.parameters_of_hard_latent[k]
                #print(t1)
                #print( self.parameters_of_hard_latent[k])
                #print(self.parameters_of_soft_latent[i, k])
                #print((t1/self.parameters_of_soft_latent[i, k]))
                
                #assert(((t1/self.parameters_of_soft_latent[i, k]) > 0) == True)
                
                log_likelihood += self.parameters_of_soft_latent[i, k] * np.log(np.divide( t1 , self.parameters_of_soft_latent[i, k]))
        
        return log_likelihood

    def fit(self, max_iteration, eps=1e-5):
        convergence_test = True
        count = 0
        while( (convergence_test == True) and (count != max_iteration)):
            self.E_step()#Update the soft latent values            
            log_likelihood_t = self.computing_log_likelihood()
            self.M_step()#Update the parameters of the condtional distribution of x given z
            log_likelihood_t_future = self.computing_log_likelihood()
            #print(f"past:{log_likelihood_t}")
            #print(f"future:{log_likelihood_t_future}")
            print(f"Number of iteration:{count}, max_iteration:{max_iteration}, past:{log_likelihood_t}, future:{log_likelihood_t_future}")
            count = count + 1
            if( (log_likelihood_t_future - log_likelihood_t) < eps and (count > 10)):
                print("We converged to the optimal value for the log-likelihood")
                convergence_test =False #We reached the parameters that maximize the log-likelihood, no adancement in the log-likelihood
        
        return self.parameters_of_hard_latent, self.parameters_of_soft_latent, self.means_of_each_clusters, self.covariance_within_each_clusters

    def prediction_dataset(self, X):
        prediciton = []
        for i in range(0, X.shape[0]):
            prediciton.append(self.predict(X[i, :]))
            
        return np.array(prediciton)
    
    def predict(self, x):
        prediction = np.zeros((self.K, 1))
        for k in range(0, self.K):
            prediction[k] = self.compute_soft_latent(k, x.reshape(-1, 1))
        
        return np.argmax(prediction)




In [5]:
#Randomly Initialized
model = MixtureofGaussian(X_train, k, "random")
parameters_of_hard_latent, parameters_of_soft_latent, means_of_each_clusters, covariance_within_each_clusters = model.fit(1000)

Number of iteration:0, max_iteration:1000, past:[[-2282.83271147]], future:[[-939.38751629]]
Number of iteration:1, max_iteration:1000, past:[[-911.72578855]], future:[[-517.28563538]]
Number of iteration:2, max_iteration:1000, past:[[-515.63779951]], future:[[-505.18031714]]
Number of iteration:3, max_iteration:1000, past:[[-501.20815646]], future:[[-476.51994133]]
Number of iteration:4, max_iteration:1000, past:[[-470.03491342]], future:[[-427.82650092]]
Number of iteration:5, max_iteration:1000, past:[[-419.0620125]], future:[[-369.235186]]
Number of iteration:6, max_iteration:1000, past:[[-362.3048864]], future:[[-339.57270936]]
Number of iteration:7, max_iteration:1000, past:[[-336.57401125]], future:[[-321.63435204]]
Number of iteration:8, max_iteration:1000, past:[[-319.87255483]], future:[[-309.64422446]]
Number of iteration:9, max_iteration:1000, past:[[-308.04244442]], future:[[-299.02582969]]
Number of iteration:10, max_iteration:1000, past:[[-297.31029576]], future:[[-287.9

In [114]:
pred = model.prediction_dataset(X_train)
pred

array([1, 1, 0, 2, 2, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2,
       0, 1, 1, 1, 2, 1, 2, 2, 1, 2, 0, 1, 0, 0, 2, 2, 0, 2, 1, 2, 0, 1,
       1, 2, 2, 1, 0, 1, 1, 2, 2, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 1, 2,
       0, 1, 0, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 0, 2, 2, 2, 1, 2, 2, 1, 2,
       2, 0, 1, 2, 0, 2, 1, 0, 1, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 2, 0, 1,
       2, 0], dtype=int64)

In [6]:
pred = model.prediction_dataset(X_train)
print("Performance on the training set")
#print(sklearn.metrics.confusion_matrix(y_train, pred))
c = sklearn.metrics.confusion_matrix(y_train, pred)
c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels

Performance on the training set


array([[35,  0,  0],
       [ 0, 38,  1],
       [ 0, 10, 28]], dtype=int64)

In [182]:
pred = model.prediction_dataset(X_test)
print("Performance on the test set")
#print(sklearn.metrics.confusion_matrix(y_test, pred))
c = sklearn.metrics.confusion_matrix(y_test, pred)
c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels

Performance on the test set


array([[15,  0,  0],
       [ 2,  9,  0],
       [ 0,  2, 10]], dtype=int64)

In [177]:
#K-means Initialized
model = MixtureofGaussian(X_train, k, "kmeans")
parameters_of_hard_latent, parameters_of_soft_latent, means_of_each_clusters, covariance_within_each_clusters = model.fit(1000)

count:0, max_iterations1000, past:10, future:0
count:1, max_iterations1000, past:565.374081063309, future:231.49457602906614
count:2, max_iterations1000, past:231.49457602906614, future:128.5139214635211
count:3, max_iterations1000, past:128.5139214635211, future:109.6719256474805
count:4, max_iterations1000, past:109.6719256474805, future:108.97247599145511
count:5, max_iterations1000, past:108.97247599145511, future:107.34494618026055
count:6, max_iterations1000, past:107.34494618026055, future:106.2528195193156
count:7, max_iterations1000, past:106.2528195193156, future:106.19859005755436
Finished the initialzation by the kmeans
[[0.07968003 0.0288702  0.04693592 0.02816197]
 [0.0288702  0.10989086 0.02636605 0.03010616]
 [0.04693592 0.02636605 0.05799119 0.04965981]
 [0.02816197 0.03010616 0.04965981 0.05964532]]
[[0.0460743  0.068189   0.00225058 0.0033825 ]
 [0.068189   0.16917547 0.0038792  0.00594619]
 [0.00225058 0.0038792  0.00294652 0.00160053]
 [0.0033825  0.00594619 0.0016

In [178]:
pred = model.prediction_dataset(X_train)
print("Performance on the training set")
#print(sklearn.metrics.confusion_matrix(y_train, pred))
c = sklearn.metrics.confusion_matrix(y_train, pred)
c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels

Performance on the training set


array([[35,  0,  0],
       [ 0, 38,  1],
       [ 0,  4, 34]], dtype=int64)

In [179]:
pred = model.prediction_dataset(X_test)
print("Performance on the test set")
#print(sklearn.metrics.confusion_matrix(y_test, pred))
c = sklearn.metrics.confusion_matrix(y_test, pred)
c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels

Performance on the test set


array([[15,  0,  0],
       [ 0, 11,  0],
       [ 0,  1, 11]], dtype=int64)

### References 
* Chapter 1, chapter 2 and Chapter 9 from Bishop, C. (2006). Pattern Recognition and Machine Learning. Cambridge: Springer.
* Andrew Ng, Lec 12: (https://www.youtube.com/watch?v=ZZGTuAkF-Hw)
* Andrew Ng, Lec 13: (https://www.youtube.com/watch?v=LBtuYU-HfUg)
