## Multivariate Student-t Distribution

### Assuming the degrees of freedom to be known in advance
#### Assumption about the continuous Latent Variable and its conditional Distribution, and the Discrete Latent Variable and its conditional Distribution
#### E-step 
#### M-step
#### Convergence Test

### Assuming the degrees of freedom to be unknown in advance
####  Assumption about the continuous Latent Variable and its conditional Distribution, and the Discrete Latent Variable and its conditional Distribution
#### Multicycle ECM
##### E-step
##### First CM 
##### Second CM-step
#### Convergence Test

------------------------
I only implemented the model with the Assumption that the degree of freedom is known.


In [1]:
%matplotlib inline
import numpy as np 
import sklearn.preprocessing
import sklearn.datasets
import pandas as pd
import sklearn.model_selection
import numpy.random
import math
import sklearn.metrics
import scipy.stats
import scipy.special
import matplotlib.pyplot as plt


In [2]:
#To have better initialization of the algorithm
class kmeans(object):

    def __init__(self, X_train, k):
        self.K = k
        self.m = X_train.shape[0]
        self.n = X_train.shape[1]
        self.X_train = X_train
        choices = numpy.random.choice(np.arange(0, self.m), self.K, replace=False)
        self.centers = [X_train[choices[i], :].reshape(-1, 1) for i in range(0, self.K)]# initalize the clusters centers to be one of the observations
        self.clusters_assignments = np.zeros((self.m, 1))#Just to give it the necessary shape
    
    def reassign_clusters_centers(self):
        for k in range(0, self.K):
            temp = np.zeros((self.n, 1))
            clusters = list(map(lambda i: True if i == k else False, self.clusters_assignments))
            for i in range(0, self.m):
                if clusters[i] == True:
                    temp += clusters[i] * self.X_train[i, :].reshape(-1, 1)#clusters contained in {0, 1}
                else:
                    pass
            #print(np.sum(clusters))
            self.centers[k] = temp/np.sum(clusters)

    def distortiuon_function(self):
        temp = 0
        for i in range(0, self.m):
            for k in range(0, self.K):
                if self.clusters_assignments[i] == k:
                    temp += np.linalg.norm(self.X_train[i, :].reshape(-1, 1) - self.centers[k].reshape(-1, 1))**2
                    break#They willn't be assigned to more than one cluster in tandem
        return temp

    def assign_to_clusters(self, x):
        temp = []
        for k in range(0, self.K):
            temp.append(np.linalg.norm(x.reshape(-1, 1) - self.centers[k].reshape(-1, 1))**2)#We will use L2-norm for dissimilarity measure
        return np.argmin(temp)#return the cluster number

    def E_step(self):
        for i in range(0, self.m):
            self.clusters_assignments[i] = self.assign_to_clusters(X_train[i, :].reshape(-1, 1))
    
    def fit(self, max_iterations, eps=1e-5):
        self.E_step()#To initialize the clusters assignments
        past = 10
        future = 0
        count = 0
        while(abs(past - future) > eps):#I will care for only lack of progress because k-means will always be able to minimize the distortion functions
            print(f"count:{count}, max_iterations{max_iterations}, past:{past}, future:{future}")
            count += 1
            past = self.distortiuon_function()
            self.reassign_clusters_centers()#The M step
            self.E_step()
            future = self.distortiuon_function()

        return self.centers, self.clusters_assignments

    def prediction_dataset(self, X):
        predictions = []
        for i in range(0, X.shape[0]):
            predictions.append(self.assign_to_clusters(X[i, :].reshape(-1, 1)))
        return predictions

    def predict(self, x):
        return  self.assign_to_clusters(x.reshape(-1, 1))



In [3]:
class Mixtures_ofMultiVariate_Student_t_model(object):

    def __init__(self, X_train, G, df, randomly="random", max_iteration =1000):
        self.G = G
        if (all(int(c)>0 for c in df)) and (len(df) == self.G):
            self.df = df#Will be a list
        else: 
            print("Wrong df value because if you view gamma function as a factorial, negative value of n wouldn't exist")
            raise Exception("wrong df")
        self.m = X_train.shape[0]
        self.n = X_train.shape[1]
        self.means_of_each_clusters = list(map(lambda i: np.zeros((self.n, 1)), np.arange(0, self.G)))
        self.covariance_within_each_clusters = list(map(lambda i: np.zeros((self.n, self.n)), np.arange(0, self.G))) 
        self.parameters_of_mixing_latent_parameter = [i for i in np.zeros((self.G, 1))]
        self.parameters_of_mixing_soft_latent = np.zeros((self.m, self.G))#Posterior for clusters
        self.continuous_latent_variable_soft = np.zeros((self.m, self.G))#Posterior of the conjugate prior

        self.X_train = X_train
        if randomly == "random":
            self.initialize_parameters_randomly()

        else:#Kmeans initialization
            model = kmeans(self.X_train, self.G)
            centers, clusters  = model.fit(max_iterations=max_iteration)
            print("Finished the initialization by the kmeans")
            for k in range(0, self.G):
                cluster_separated = list(map(lambda i: True if i == k else False, clusters))
                self.parameters_of_mixing_latent_parameter[k] = np.sum(cluster_separated)/self.m
                self.means_of_each_clusters[k] = (centers[k]).reshape(-1, 1)
                #Calaculating the covariance matrix matrix within each cluster
                temp = np.zeros((self.n, self.n))
                for i in range(0, self.m):
                    if cluster_separated[i] == True:
                        temp += np.dot((self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[k].reshape(-1, 1)).reshape(-1, 1), (self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[k].reshape(-1, 1)).reshape(1, -1))
                
                self.covariance_within_each_clusters[k] = (1/np.sum(clusters)) * temp
                #print(self.covariance_within_each_clusters[k])

    def initialize_parameters_randomly(self):
        for g in range(0, self.G):
            self.means_of_each_clusters[g] = (numpy.random.randn(self.n)).reshape(-1, 1)
            c = np.random.randn(self.n, self.n)
            self.covariance_within_each_clusters[g] = 25 * np.dot(c, c.T)
            self.parameters_of_mixing_latent_parameter[g] = abs(numpy.random.randn())

        sums = np.sum(self.parameters_of_mixing_latent_parameter)
        self.parameters_of_mixing_latent_parameter = self.parameters_of_mixing_latent_parameter/sums#To ensure that the parameters of the Multinomial distribution sums to 1

    def Mahalanobis_distance(self, x, g):
        return np.dot(np.transpose(x.reshape(-1, 1) - self.means_of_each_clusters[g].reshape(-1, 1)), np.dot(np.linalg.inv(self.covariance_within_each_clusters[g]), x.reshape(-1, 1) - self.means_of_each_clusters[g].reshape(-1, 1)))

    def MultiVariate_Student_t_PDF(self, x, g):
        det = np.linalg.det(self.covariance_within_each_clusters[g])
        #print(det)
        return (
            (scipy.special.gamma((self.df[g] + self.n)/2)/scipy.special.gamma(self.df[g] /2) ) * (1/(np.pi * self.df[g])) * 
        (1/np.sqrt(det)) * 
        (1/(1 + (self.Mahalanobis_distance(x, g)/ self.df[g]) )**((self.df[g]+self.n)/2) )
            )

    def compute_continuous_latent(self, x, g):
        return (self.df[g] + self.n)/(self.df[g] + self.Mahalanobis_distance(x, g))
    
    def compute_discrete_latent(self, x, g):
        denominator = 0 
        for k in range(0, self.G):
            denominator = denominator + ( self.MultiVariate_Student_t_PDF(x, k) * self.parameters_of_mixing_latent_parameter[k])
        #1e-50 were added to preven division by zero when computing the log-likelihood
        return (self.MultiVariate_Student_t_PDF(x, g) * self.parameters_of_mixing_latent_parameter[g])/denominator
    
    def E_step(self):
        self.continuous_latent_variable_soft = np.array(list(map(lambda x: np.array([self.compute_continuous_latent(x, g) for g in range(0, self.G)]).reshape(1, -1), self.X_train))).reshape(self.m, self.G)
        #print(self.continuous_latent_variable_soft.shape)
        self.parameters_of_mixing_soft_latent = np.array(list(map(lambda x: np.array([self.compute_discrete_latent(x, g) for g in range(0, self.G)]).reshape(1, -1), self.X_train))).reshape(self.m, self.G)
    
    def M_step(self):
        for g in range(0, self.G):
            self.parameters_of_mixing_latent_parameter[g] = (1/self.m) * np.sum(self.parameters_of_mixing_soft_latent[:, g])
            #print((self.continuous_latent_variable_soft[:, g].reshape(-1, 1) * self.parameters_of_mixing_soft_latent[:, g].reshape(-1, 1)).shape)

            temp = np.zeros((self.n, 1))
            for i in range(0, self.m):
                temp += (self.continuous_latent_variable_soft[i, g] * self.parameters_of_mixing_soft_latent[i, g] *self.X_train[i, :]).reshape(-1, 1)
            temp = (1/np.sum(np.multiply(self.continuous_latent_variable_soft[:, g].reshape(-1, 1), self.parameters_of_mixing_soft_latent[:, g].reshape(-1, 1)) ) ) * temp
            self.means_of_each_clusters[g] = temp
            #self.means_of_each_clusters[g] =(1/np.sum(np.multiply(self.continuous_latent_variable_soft[:, g].reshape(-1, 1), self.parameters_of_mixing_soft_latent[:, g].reshape(-1, 1)) ) ) * np.sum(self.continuous_latent_variable_soft[:, g].reshape(-1, 1) * self.parameters_of_mixing_soft_latent[:, g].reshape(-1, 1) * self.X_train.T, axis=0).reshape(-1, 1)

            for i in range(0, self.m):
                self.covariance_within_each_clusters[g] += self.continuous_latent_variable_soft[i, g] * self.parameters_of_mixing_soft_latent[i, g] * np.dot((self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[g].reshape(-1, 1)),(self.X_train[i, :].reshape(-1, 1) - self.means_of_each_clusters[g].reshape(-1, 1)).T)
            
            self.covariance_within_each_clusters[g] /= np.sum(self.parameters_of_mixing_soft_latent[:, g])
    
    def compute_log_likelihood(self):
        temp = 0
        for i in range(0, self.m):
            temp2 = 0
            for g in range(0, self.G):
                temp2 += self.parameters_of_mixing_latent_parameter[g] * self.MultiVariate_Student_t_PDF(self.X_train[i, :], g)
            #if temp2 <=0:
                #print(temp2)
            #assert(temp2 > 0)
            #print(temp2)
            temp += np.log(temp2)
        
        return temp

    def fit(self, max_iteration, eps=1e-3):
        convergence_test = True
        count = 0
        while( (convergence_test == True) and (count != max_iteration)):
            self.E_step()#Update the soft latent values
            m_means, m_cov, m_mixing = (self.means_of_each_clusters.copy(), self.covariance_within_each_clusters.copy(), self.parameters_of_mixing_latent_parameter.copy())            
            log_likelihood_t = self.compute_log_likelihood()
            self.M_step()#Update the parameters of the conditional distribution of x given z and u
            log_likelihood_t_future = self.compute_log_likelihood()
            print(f"Number of iteration:{count}, max_iteration:{max_iteration}, past:{log_likelihood_t}, future:{log_likelihood_t_future}")
            count = count + 1
            if log_likelihood_t_future != log_likelihood_t_future:#The usual trick nan doesn't equal itself
                self.means_of_each_clusters, self.covariance_within_each_clusters, self.parameters_of_mixing_latent_parameter = (m_means, m_cov, m_mixing)
                print("Something wrong happened in the Maximization step")
                break
            #print(log_likelihood_t_future[0])
            if( (log_likelihood_t_future - log_likelihood_t) < eps and (count > 10)):
                print("We converged to the optimal value for the log-likelihood")
                convergence_test =False #We reached the parameters that maximize the log-likelihood, no adancement in the log-likelihood
        
        return self.means_of_each_clusters, self.covariance_within_each_clusters, self.parameters_of_mixing_latent_parameter

    def prediction_dataset(self, X):
        prediciton = []
        for i in range(0, X.shape[0]):
            prediciton.append(self.predict(X[i, :]))
            
        return np.array(prediciton)
    
    def predict(self, x):
        prediction = np.zeros((self.G, 1))
        for g in range(0, self.G):
            prediction[g] = self.compute_discrete_latent(x.reshape(-1, 1), g)
        
        return np.argmax(prediction)



In [23]:
numpy.random.seed(120)

#Using IRIS and Wine Dataset
#X, y = sklearn.datasets.load_iris(return_X_y=True)
X, y = sklearn.datasets.load_wine(return_X_y=True)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)
#standard = sklearn.preprocessing.StandardScaler()
#X_train = standard.fit_transform(X_train)
training_data = np.c_[X_train, y_train]#All of the features are continuous, so, no need to use one-hot encoder and we can directly standard normalize the features of the data set

#X_test = standard.transform(X_test)
test_data = np.c_[X_test, y_test]
print(training_data.shape)
print(test_data.shape)
k = len(set(y_train))
y_train#It needs to be labeled from 0 to k


(133, 14)
(45, 14)


array([0, 1, 1, 2, 0, 1, 0, 0, 2, 2, 1, 1, 0, 1, 0, 2, 1, 1, 2, 0, 0, 0,
       2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 2, 1,
       1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1, 2, 2, 1, 2, 1, 1,
       1, 0, 0, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 2, 1, 1, 1, 2, 2,
       1, 0, 0, 1, 2, 2, 0, 1, 2, 2, 2, 2, 1, 0, 1, 0, 2, 0, 0, 1, 0, 0,
       2, 1, 0, 2, 2, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 1,
       1])

In [24]:
print(k)


3


In [25]:
#Randomly Initialized
df = [10, 10, 10]
model = Mixtures_ofMultiVariate_Student_t_model(X_train, k, df, "kmeans", 10)
means_of_each_clusters, covariance_within_each_clusters, parameters_of_mixing_latent_parameter = model.fit(1000)

count:0, max_iterations10, past:10, future:0
count:1, max_iterations10, past:10792010.491995996, future:3102217.0489981584
count:2, max_iterations10, past:3102217.0489981584, future:1978365.5976835634
count:3, max_iterations10, past:1978365.5976835634, future:1795178.7694889628
count:4, max_iterations10, past:1795178.7694889628, future:1702081.6684963438
count:5, max_iterations10, past:1702081.6684963438, future:1633691.812665524
count:6, max_iterations10, past:1633691.812665524, future:1616124.000250265
count:7, max_iterations10, past:1616124.000250265, future:1615357.3439364806
Finished the initialzation by the kmeans
Number of iteration:0, max_iteration:1000, past:[[101.20972741]], future:[[239.08935794]]
Number of iteration:1, max_iteration:1000, past:[[239.08935794]], future:[[305.70311323]]
Number of iteration:2, max_iteration:1000, past:[[305.70311323]], future:[[335.73446949]]
Number of iteration:3, max_iteration:1000, past:[[335.73446949]], future:[[351.79857503]]
Number of it

In [27]:
pred = model.prediction_dataset(X_train)
print("Performance on the training set")
#print(sklearn.metrics.confusion_matrix(y_train, pred))
c = sklearn.metrics.confusion_matrix(y_train, pred)
c = c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels
c

Performance on the training set


array([[39,  0,  5],
       [ 2, 41, 10],
       [ 0, 17, 19]], dtype=int64)

In [28]:
pred = model.prediction_dataset(X_test)
print("Performance on the test set")
#print(sklearn.metrics.confusion_matrix(y_test, pred))
c = sklearn.metrics.confusion_matrix(y_test, pred)
c = c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels
c

Performance on the test set


array([[12,  0,  0],
       [ 2, 14, 14],
       [ 0,  6,  6]], dtype=int64)

In [29]:
#Generating random variables from multivariate Student-t distribution 
###https://stackoverflow.com/questions/29798795/multivariate-student-t-distribution-with-python##
def multivariatet(mu,Sigma,dof,m):
    '''
    Output:
    Produce M samples of d-dimensional multivariate t distribution
    Input:
    mu = mean (d dimensional numpy array or scalar)
    Sigma = scale matrix (dxd numpy array)
    dof = degrees of freedom
    m = # of samples to produce
    '''
    d = Sigma.shape[1]
    g = np.tile(np.random.gamma(dof/2.,2./dof,m),(d,1)).T#From https://en.wikipedia.org/wiki/Multivariate_t-distribution, and the relationship between gamma distribution and chi-squared distribution
    #Page 4 from http://users.isy.liu.se/en/rt/roth/student.pdf
    Z = np.random.multivariate_normal(np.zeros(d),Sigma,m)
    return mu + Z/np.sqrt(g[:])

In [34]:
#Testing the model with arbitrary data set

n1 = 10
df1 = 20.5
m1 = 200
mean1 = np.random.randn(n1).reshape(-1, 1)
Sigma1 = np.random.randn(n1, n1) 
Sigma1 = np.dot(Sigma1, Sigma1.T)
#Sigma1 = 20 * np.eye(n1)
print(np.linalg.matrix_rank(Sigma1))#Need to ensure that the cov is full rank, otherwise the inverse of the covariance wouldn't exist
X_train1 = multivariatet(mean1.reshape(1, -1), Sigma1, df1, m1)
ytrain1 = np.array([0]*m1).reshape(-1, 1)
#print(X_train1.shape)
#Check its mean 
c1 = np.mean(X_train1, axis=0)
c2 = (df1/(df1-2)) * np.cov(X_train1.T)
#print("Comparing Means")
#[print(f"x1:{x1}|x2:{x2}") for x1, x2 in  zip(mean1, c1)];
#print("Comparing Covariance")
#[print(f"new row\n x1:{x1}\nx2:{x2}") for x1, x2 in  zip(Sigma1, c2)];
# I am going to assume the mean and covariance that would be estimated will have the following form
mean1 = c1
cov1 = c2
#################################
n2 = 10
df2 = 7.5
m2 = 200
mean2 = np.random.randn(n2).reshape(-1, 1)
Sigma2 = np.random.randn(n2, n2) 
Sigma2 = np.dot(Sigma2, Sigma2.T)
#Sigma2 = 60 * np.eye(n2)
print(np.linalg.matrix_rank(Sigma2))#Need to ensure that the cov is full rank, otherwise the inverse of the covariance wouldn't exist
X_train2 = multivariatet(mean2.reshape(1, -1), Sigma2, df2, m2)
ytrain2 = np.array([1]*m2).reshape(-1, 1)
#print(X_train2.shape)
#Check its mean 
c1 = np.mean(X_train2, axis=0)
c2 = (df2/(df2-2)) * np.cov(X_train2.T)
#print("Comparing Means")
#[print(f"x1:{x1}|x2:{x2}") for x1, x2 in  zip(mean2, c1)];
#print("Comparing Covariance")
#[print(f"new row\n x1:{x1}\nx2:{x2}") for x1, x2 in  zip(Sigma2, c2)];
# I am going to assume the mean and covariance that would be estimated will have the following form
mean2 = c1
cov2 = c2
#################################

n3 = 10
df3 = 43.5
m3 = 200
mean3 = np.random.randn(n3).reshape(-1, 1)
Sigma3 = np.random.randn(n3, n3) 
Sigma3 = np.dot(Sigma3, Sigma3.T)
#Sigma3 = 99 * np.eye(n3)
print(np.linalg.matrix_rank(Sigma3))#Need to ensure that the cov is full rank, otherwise the inverse of the covariance wouldn't exist
X_train3 = multivariatet(mean3.reshape(1, -1), Sigma3, df3, m3)
ytrain3 = np.array([2]*m3).reshape(-1, 1)
#print(X_train3.shape)
#Check its mean 
c1 = np.mean(X_train3, axis=0)
c2 = (df3/(df3-2)) * np.cov(X_train3.T)
#print("Comparing Means")
#[print(f"x1:{x1}|x2:{x2}") for x1, x2 in  zip(mean3, c1)];
#print("Comparing Covariance")
#[print(f"new row\n x1:{x1}\nx2:{x2}") for x1, x2 in  zip(Sigma3, c2)];
# I am going to assume the mean and covariance that would be estimated will have the following form
mean3 = c1
cov3 = c2


10
10
10


In [35]:
X_train = np.vstack([X_train1, X_train1, X_train3])
y_train = np.vstack([ytrain1, ytrain2, ytrain3])
y_train.shape

(600, 1)

In [36]:
X_train = np.vstack([X_train1, X_train1, X_train3])
y_train = np.vstack([ytrain1, ytrain2, ytrain3])
choices = np.random.permutation(np.arange(0, X_train.shape[0]))
X_train = X_train[choices, :]
y_train = y_train[choices]
#Randomly Initialized
df = [20.5, 7.5, 43.5]
#df = [30, 30 , 30]
k=3
model = Mixtures_ofMultiVariate_Student_t_model(X_train, k, df, "kmeans")
means_of_each_clusters, covariance_within_each_clusters, parameters_of_mixing_latent_parameter = model.fit(1000)

pred = model.prediction_dataset(X_train)
print("Performance on the training set")
#print(sklearn.metrics.confusion_matrix(y_train, pred))
c = sklearn.metrics.confusion_matrix(y_train, pred)
#c[:, list(np.argmax(c, axis=1))]#ordering the cluster to where it shows the highest number of matching with the true labels
c

count:0, max_iterations1000, past:10, future:0
count:1, max_iterations1000, past:76558.59948251564, future:63042.563887490694
count:2, max_iterations1000, past:63042.563887490694, future:62480.342083516385
count:3, max_iterations1000, past:62480.342083516385, future:62267.77309549749
count:4, max_iterations1000, past:62267.77309549749, future:62212.105536786235
count:5, max_iterations1000, past:62212.105536786235, future:62086.27793319166
count:6, max_iterations1000, past:62086.27793319166, future:61911.96962116065
count:7, max_iterations1000, past:61911.96962116065, future:61720.224273176274
count:8, max_iterations1000, past:61720.224273176274, future:61545.25346652394
count:9, max_iterations1000, past:61545.25346652394, future:61383.888717597416
count:10, max_iterations1000, past:61383.888717597416, future:61288.44921967737
count:11, max_iterations1000, past:61288.44921967737, future:61092.43305767626
count:12, max_iterations1000, past:61092.43305767626, future:61001.46022445557
coun

array([[200,   0,   0],
       [200,   0,   0],
       [200,   0,   0]], dtype=int64)

In [None]:
print("Comparing Means")
[print(f"x1:{x1}|x2:{x2}") for x1, x2 in  zip(mean1, means_of_each_clusters[2])];
print("Comparing Covariance")
[print(f"new row\n x1:{x1}\nx2:{x2}") for x1, x2 in  zip(cov1, covariance_within_each_clusters[0])];
covariance_within_each_clusters[0]

### References 
* Chapter 2, Chapter 9 and Chapter 12 from Bishop, C. (2006). Pattern Recognition and Machine Learning. Cambridge: Springer.
* Chapter 5 from McNicholas, P.D. (2016). Mixture Model-Based Classification. Boca Raton: Chapman &
Hall/CRC Press.
* McLachlan, G., and  Krishnan T. (2008). The EM Algorithm and Extensions, Second Edition. New York: Wiley.