# The Gaussian Mixture model

 Unlike methods like K-Means, which assign each point to a single cluster, GMM gives a probability for each point to belong to different clusters, making it more flexible for complex datasets where clusters may overlap or have different shapes.

## Task 4, Initializing the GMM

* X is a numpy.ndarray of shape (n, d) containing the data set
- k is a positive integer containing the number of cluster
* You are not allowed to use any loops
- Returns: pi, m, S, or None, None, None on failure
pi is a numpy.ndarray of shape (k,) containing the priors for each cluster, initialized evenly
m is a numpy.ndarray of shape (k, d) containing the centroid means for each cluster, initialized with K-means
S is a numpy.ndarray of shape (k, d, d) containing the covariance matrices for each cluster, initialized as identity matrices

In [3]:
#!/usr/bin/env python3
"""
Initializes variables for a Gaussian mixture model.
"""


import numpy as np
kmeans = __import__('1-kmeans').kmeans


def initialize(X, k):
    """
    Initializes variables for a Gaussian Mixture Mode
    """
    if not isinstance(X, np.ndarray) or len(X.shape) != 2:
        return None, None, None
    if not isinstance(k, int) or k < 1:
        return None, None, None
    
    n, d = X.shape
    
    phi = np.ones(k) / k
    
    m, _ = kmeans(X, k)
    
    S = np.tile(np.identity(d), (k, 1)).reshape(k, d, d)
    
    return phi, m, S


In [4]:
# main func

if __name__ == '__main__':
    np.random.seed(11)
    a = np.random.multivariate_normal([30, 40], [[75, 5], [5, 75]], size=10000)
    b = np.random.multivariate_normal([5, 25], [[16, 10], [10, 16]], size=750)
    c = np.random.multivariate_normal([60, 30], [[16, 0], [0, 16]], size=750)
    d = np.random.multivariate_normal([20, 70], [[35, 10], [10, 35]], size=1000)
    X = np.concatenate((a, b, c, d), axis=0)
    np.random.shuffle(X)
    pi, m, S = initialize(X, 4)
    print(pi)
    print(m)
    print(S)

[0.25 0.25 0.25 0.25]
[[54.73711515 31.81393242]
 [16.84012557 31.20248225]
 [21.43215816 65.50449077]
 [32.3301925  41.80664127]]
[[[1. 0.]
  [0. 1.]]

 [[1. 0.]
  [0. 1.]]

 [[1. 0.]
  [0. 1.]]

 [[1. 0.]
  [0. 1.]]]


## PDF

Calculates the probability density function of a Gaussian distribution

* X is a numpy.ndarray of shape (n, d) containing the data points whose PDF should be evaluated
- m is a numpy.ndarray of shape (d,) containing the mean of the distribution
* You are not allowed to use any loops
- You are not allowed to use the function numpy.diag or the method numpy.ndarray.diagonal
* Returns: P, or None on failure
P is a numpy.ndarray of shape (n,) containing the PDF values for each data point
- All values in P should have a minimum value of 1e-300

In [1]:
#!/usr/bin/env python3
"""
Calculates the probability density function of a Gaussian Distribution
"""


import numpy as np


def pdf(X, m, S):
    """
    Calculates the probability density function of a Gaussian Distribution
    """
    if not isinstance(X, np.ndarray) or len(X.shape) != 2:
        return None
    if not isinstance(m, np.ndarray) or len(m.shape) != 1:
        return None
    if not isinstance(S, np.ndarray) or len(S.shape) != 2:
        return None
    if S.shape[0] != S.shape[1]:
        return None
    
    n, d = X.shape
    mean = m
    x_m = X - mean
    
    det_S = np.linalg.det(S)
    
    inv_S = np.linalg.inv(S)
    
    part_1_dem = np.sqrt(det_S) * ((2 * np.pi) ** (d / 2))
    
    part_2 = np.matmul(x_m, inv_S)
    
    part_2_1 = np.sum(x_m * part_2, axis=1)
    
    part_2_2 = np.sum(x_m * part_2, axis=1)
    
    pdf = part_2_2 / part_1_dem
    P = np.where(pdf < 1e-300, 1e-300, pdf)
    return P


In [2]:
# main func

if __name__ == '__main__':
    np.random.seed(0)
    m = np.array([12, 30, 10])
    S = np.array([[36, -30, 15], [-30, 100, -20], [15, -20, 25]])
    X = np.random.multivariate_normal(m, S, 10000)
    P = pdf(X, m, S)
    print(P)

[0.00121828 0.00272588 0.00026965 ... 0.00048263 0.0002508  0.00138663]
