In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
plt.rcParams["animation.html"] = "jshtml"
import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import norm, multivariate_normal

# GMM

In this notebook, we will explore the Gaussian mixture model (GMM) algorithm in its simplest form.
The data can be generated as in the previous notebook.

In [None]:
# Synthetic 2-D blob data; `return_centers=True` also yields the true blob centers.
X, y, centers = make_blobs(
    n_samples=1000,
    n_features=2,
    random_state=42,
    return_centers=True,
)

# TODO: Shuffle and split the data

# Samples are coloured by their generating blob; the true centers are drawn
# on top as large red markers.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)
ax.scatter(centers[:, 0], centers[:, 1], c='r', s=120)
ax.legend(["data", "centers"])
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")

## Fitting a normal distribution to data
Use `norm.fit` to estimate the parameters and `norm.pdf` to evaluate the pdf for plotting.

In [None]:
# Draw a reproducible 1-D sample from N(2, 0.1^2); the generator is seeded so
# that Restart & Run All always produces the same figure.
rng = np.random.default_rng(0)
data = rng.standard_normal(1000) / 10 + 2
plt.hist(data, density=True)

# Estimates of the location (mean) and scale (standard deviation).
mu, std = norm.fit(data)

# Evaluate the fitted pdf where the sample actually lives (mean +/- 4 std).
# A fixed [-3, 3] range would sample the narrow peak with only a handful of
# points and squeeze the histogram into a sliver of the axes.
x_plot = np.linspace(mu - 4 * std, mu + 4 * std, 200)
plt.plot(x_plot, norm.pdf(x_plot, mu, std))

For a multivariate normal distribution, we can use `multivariate_normal` from `scipy.stats`.

In [None]:
# Reproducible 2-D sample: independent N(2, 0.1^2) coordinates. The generator
# is seeded so that re-running the notebook gives the same points.
rng = np.random.default_rng(1)
data = rng.standard_normal((1000, 2)) / 10 + 2
plt.scatter(data[:,0], data[:,1])

def plot_mv_normal(mu,sigm,low,high,res):
    """Draw contour lines of a 2-D normal density with pyplot.

    The density with mean `mu` and covariance `sigm` is evaluated on a square
    `res` x `res` grid spanning [`low`, `high`] along both axes.

    Returns whatever `plt.contour` returns for that grid.
    """
    axis_ticks = np.linspace(low, high, res)
    grid_x, grid_y = np.meshgrid(axis_ticks, axis_ticks)

    # The last axis holds the (x, y) coordinates of every grid node.
    nodes = np.stack((grid_x, grid_y), axis=-1)
    density = multivariate_normal(mu, sigm).pdf(nodes)
    return plt.contour(axis_ticks, axis_ticks, density)

# `multivariate_normal.fit` returns the estimated mean vector and covariance
# matrix. NOTE: despite its name, `std` holds the covariance matrix here,
# not standard deviations.
mu, std = multivariate_normal.fit(data)

# Contours of the fitted density over the square [1.25, 2.5] x [1.25, 2.5].
plot_mv_normal(mu, std, low=1.25, high=2.5, res=50)

You can start from the K-means clustering algorithm and modify it as follows:
- Each cluster is a normal distribution, parameterized by its mean and covariance
- Instead of calculating the distance to the cluster center, we calculate the probability of each point belonging to each cluster
- We can assign each point to the cluster with the highest probability
- The parameters of the normal distributions are updated using the points assigned to each cluster
- The process is repeated until convergence

In [None]:
class GMM_demo:
    """Minimal Gaussian mixture model fitted with hard-assignment EM.

    Every cluster is a multivariate normal distribution described by a mean
    vector and a covariance matrix (all clusters carry equal weight). Fitting
    alternates two steps until the parameters stop changing:

    1. evaluate, for every point, the normalised probability of belonging to
       each cluster and assign the point to the most probable one;
    2. re-estimate each cluster's mean and covariance from its assigned points.

    Parameters
    ----------
    K : int
        Number of clusters.
    max_iters : int
        Upper bound on the number of assignment/update rounds.
    tol : float
        Fitting stops once no mean or covariance entry changes by more than
        this amount between two consecutive rounds.
    random_state : int or None
        Seed for the random initialisation of the means.
    reg_covar : float
        Value added to the diagonal of every covariance matrix so that it
        stays positive definite.

    Attributes (available after `fit`)
    ----------------------------------
    means : ndarray, shape (K, d)
    sigma : ndarray, shape (K, d, d)
    G : ndarray, shape (n, K)
        Row-normalised cluster probabilities of the training points;
        `np.argmax(G, 1)` gives the hard cluster labels.
    n_iter : int
        Number of rounds that were run.
    converged : bool
        Whether the tolerance was reached within `max_iters` rounds.
    """

    def __init__(self, K, max_iters=200, tol=1e-8, random_state=None, reg_covar=1e-6):
        self.K = K
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state
        self.reg_covar = reg_covar

    def _init_means(self, X, rng):
        """Pick K starting means from the data with k-means++ style seeding."""
        n = X.shape[0]
        chosen = [X[rng.integers(n)]]
        while len(chosen) < self.K:
            # Squared distance of every point to its closest chosen mean.
            d2 = cdist(X, np.asarray(chosen), "sqeuclidean").min(axis=1)
            total = d2.sum()
            if total > 0:
                # Far-away points are more likely to seed the next cluster.
                idx = rng.choice(n, p=d2 / total)
            else:
                # All points coincide with a chosen mean: fall back to uniform.
                idx = rng.integers(n)
            chosen.append(X[idx])
        return np.array(chosen, dtype=float)

    def _responsibilities(self, X):
        """Return the (n, K) matrix of row-normalised cluster probabilities."""
        log_p = np.empty((X.shape[0], self.K))
        for k in range(self.K):
            log_p[:, k] = multivariate_normal.logpdf(
                X, mean=self.means[k], cov=self.sigma[k]
            )
        # Work in log space and subtract the row maximum before exponentiating
        # so that points far from every cluster do not underflow to 0/0.
        log_p -= log_p.max(axis=1, keepdims=True)
        G = np.exp(log_p)
        return G / G.sum(axis=1, keepdims=True)

    def fit(self, X, init_means=None):
        """Fit the mixture to `X` and return `self`.

        Parameters
        ----------
        X : array-like, shape (n, d)
            Training points, one per row.
        init_means : array-like, shape (K, d), optional
            Starting means. When omitted they are drawn from the data using
            `random_state`.

        Raises
        ------
        ValueError
            If `X` is not 2-D, if `K` is not between 1 and the number of
            points, or if `init_means` has the wrong shape.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n, d = X.shape
        if not 1 <= self.K <= n:
            raise ValueError("K must be between 1 and the number of samples")

        rng = np.random.default_rng(self.random_state)
        if init_means is None:
            self.means = self._init_means(X, rng)
        else:
            self.means = np.array(init_means, dtype=float)
            if self.means.shape != (self.K, d):
                raise ValueError("init_means must have shape (K, n_features)")

        # Start from identical isotropic covariances: with equal spherical
        # clusters the first assignment is exactly the K-means
        # nearest-mean assignment.
        ridge = self.reg_covar * np.eye(d)
        scale = X.var(axis=0).mean()
        if not scale > 0:
            scale = 1.0
        self.sigma = np.tile(scale * np.eye(d) + ridge, (self.K, 1, 1))

        self.n_iter = 0
        self.converged = False
        for it in range(self.max_iters):
            labels = self._responsibilities(X).argmax(axis=1)

            new_means = self.means.copy()
            new_sigma = self.sigma.copy()
            for k in range(self.K):
                members = X[labels == k]
                # Too few points to estimate a full-rank covariance: keep the
                # previous parameters of this cluster instead of degenerating.
                if members.shape[0] <= d:
                    continue
                new_means[k] = members.mean(axis=0)
                new_sigma[k] = (
                    np.cov(members, rowvar=False, bias=True).reshape(d, d) + ridge
                )

            shift = max(
                np.abs(new_means - self.means).max(),
                np.abs(new_sigma - self.sigma).max(),
            )
            self.means, self.sigma = new_means, new_sigma
            self.n_iter = it + 1
            if shift <= self.tol:
                self.converged = True
                break

        # Probabilities under the final parameters, used for labelling/plotting.
        self.G = self._responsibilities(X)
        return self

In [None]:
# Build a three-component model and fit it to the full blob dataset.
n_components = 3
mod = GMM_demo(K=n_components)
mod.fit(X)

In [None]:
# Colour every point by the column holding the largest entry in its row of G.
hard_labels = np.argmax(mod.G, axis=1)
plt.scatter(X[:, 0], X[:, 1], c=hard_labels)

# Overlay the density contours of each component on the square [-10, 10]^2.
for component in range(mod.K):
    component_mean = mod.means[component]
    component_cov = mod.sigma[component]
    plot_mv_normal(component_mean, component_cov, -10, 10, 50)