In [119]:
import argparse
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal as mvn
from numpy.core.umath_tests import matrix_multiply as mm

In [120]:
def read_data(file):
    """Load a header-less CSV source into a DataFrame.

    ``file`` is handed straight to ``pandas.read_csv`` with ``header=None``,
    so the first line is treated as data and the columns receive the default
    integer labels (0, 1, 2, ...).
    """
    frame = pd.read_csv(file, header=None)
    return frame

In [121]:
def gen_random(col):
    """Return one random value drawn uniformly between a column's extremes.

    ``col`` must expose ``min()`` and ``max()`` (e.g. a pandas Series). The
    draw comes from NumPy's global random state, so it is reproducible only
    if the caller seeds ``np.random`` beforehand.
    """
    lower, upper = col.min(), col.max()

    # Ask for a one-element sample and unwrap it to a scalar.
    draw = np.random.uniform(low=lower, high=upper, size=1)
    return draw[0]

In [122]:
def EM_Algorithm(iris, k, e, max_iter=1000000):
    """Fit a k-component Gaussian mixture with expectation-maximisation.

    The model is fitted to the columns of ``iris`` labelled 0 through 3; any
    other columns are ignored. Component means are initialised by calling
    ``gen_random`` once per feature column for every cluster, covariances
    start as identity matrices and mixing weights start uniform at ``1/k``.
    Iteration stops once the absolute change in total log-likelihood is no
    larger than ``e`` or after ``max_iter`` iterations. A summary (iteration
    count, hard-assignment cluster sizes, means, covariances) is printed
    before returning.

    Parameters
    ----------
    iris : pandas.DataFrame
        Data frame whose columns labelled 0..3 hold the numeric features.
    k : int
        Number of mixture components; must be at least 1.
    e : float
        Convergence tolerance on the change in log-likelihood.
    max_iter : int, optional
        Upper bound on the number of EM iterations (default 1000000).

    Returns
    -------
    tuple
        ``(diff, like_new, cl_p, cl_mus, sigma, i, ws)``: the last
        log-likelihood change, the final log-likelihood, the mixing weights
        (length k), the component means (k x d), the component covariances
        (k x d x d), the number of iterations run, and the responsibilities
        from the last E-step (k x n).

    Raises
    ------
    ValueError
        If ``k`` or ``max_iter`` is smaller than 1, or ``iris`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Slice and convert the feature block once instead of on every iteration.
    features = iris.loc[:, 0:3]
    X = features.to_numpy(dtype=float)
    n, d = X.shape
    if n == 0:
        raise ValueError("iris must contain at least one row")

    # Initial parameters: identity covariances, random means, uniform weights.
    sigma = np.array([np.eye(d)] * k)
    cl_mus = np.array([
        [gen_random(features[col]) for col in features.columns]
        for _ in range(k)
    ])
    cl_p = np.full(k, 1.0 / k)

    # diff starts at +inf so the loop body runs at least once for any
    # tolerance (ws and like_new are therefore always defined afterwards);
    # like_old starts at -inf so the first comparison cannot stop the loop
    # by accident.
    like_old = -np.inf
    diff = np.inf
    i = 0

    while diff > e and i < max_iter:
        # E-step: responsibility of every cluster for every observation.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = cl_p[j] * mvn(cl_mus[j], sigma[j]).pdf(X)
        ws /= ws.sum(0)

        # M-step: update mixing weights and means from the responsibilities.
        resp_totals = ws.sum(axis=1)
        cl_p = resp_totals / n
        cl_mus = np.dot(ws, X) / resp_totals[:, None]

        # M-step: responsibility-weighted covariance of each cluster.
        sigma = np.zeros((k, d, d))
        for j in range(k):
            ys = X - cl_mus[j, :]
            # Per-observation outer products via broadcasting; this avoids the
            # private numpy.core.umath_tests.matrix_multiply helper and keeps
            # each summed matrix exactly symmetric.
            outer = ys[:, :, None] * ys[:, None, :]
            sigma[j] = (ws[j, :, None, None] * outer).sum(axis=0)
        sigma /= resp_totals[:, None, None]

        # Log-likelihood of the data under the updated parameters.
        density = 0
        for p, mu, sig in zip(cl_p, cl_mus, sigma):
            density = density + p * mvn(mu, sig).pdf(X)
        like_new = np.log(density).sum()

        diff = np.abs(like_new - like_old)
        like_old = like_new
        i += 1

    print("\n Number of iterations for the convergence is = ", i)

    # Hard-assign each observation to its most responsible cluster.
    new_nodes = pd.DataFrame(ws.T)
    new_nodes['tag'] = new_nodes.idxmax(axis=1)
    print("\n The number of nodes in new clusters are - \n", new_nodes.groupby(['tag']).agg('count')[0])

    # Report the actual cluster count rather than a hard-coded 3.
    print("\n Mean Matrix for the {} clusters is - \n".format(k), cl_mus)

    print("\n Covariance for the {} clusters are - \n".format(k), sigma)

    return diff, like_new, cl_p, cl_mus, sigma, i, ws
    

In [123]:
# Driver cell: load the data set and fit the mixture model.
# NOTE(review): gen_random draws from np.random and no seed is set in the
# visible cells, so cluster order and iteration count can differ between runs.
if __name__ == '__main__':
   
    # Run configuration: input file, number of mixture components, and the
    # convergence tolerance on the change in log-likelihood.
    filename = 'iris.txt'
    clusters = 3
    epsilon = 0.001
    
    # Read the text file
    df = read_data(filename)
    
    # Apply EM algorithm
    # Unpacked in the order EM_Algorithm returns them: final log-likelihood
    # change, final log-likelihood, mixing weights, component means,
    # component covariances, iteration count, and the per-cluster
    # responsibilities for every row.
    eps, ll2, Ps, means, covs, iterations, points = EM_Algorithm(df, clusters, epsilon)



 Number of iterations for the convergence is =  25

 The number of nodes in new clusters are - 
 tag
0    50
1    51
2    49
Name: 0, dtype: int64

 Mean Matrix for the 3 clusters is - 
 [[5.00622574 3.41849879 1.46407274 0.2439742 ]
 [6.28644387 2.78225055 4.67362944 1.4481268 ]
 [6.23867405 2.95461529 5.11897605 1.88578571]]

 Covariance for the 3 clusters are - 
 [[[0.12170433 0.09808362 0.01578603 0.01035326]
  [0.09808362 0.14178197 0.01137131 0.01124095]
  [0.01578603 0.01137131 0.02950515 0.00559047]
  [0.01035326 0.01124095 0.00559047 0.01126738]]

 [[0.60718492 0.18087948 0.71191297 0.22103544]
  [0.18087948 0.1422663  0.18602168 0.06866512]
  [0.71191297 0.18602168 0.98339189 0.30827955]
  [0.22103544 0.06866512 0.30827955 0.10715259]]

 [[0.27600154 0.0699252  0.21869407 0.12521287]
  [0.0699252  0.0653043  0.06411204 0.05306872]
  [0.21869407 0.06411204 0.29993333 0.17357957]
  [0.12521287 0.05306872 0.17357957 0.15339671]]]
