In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import seaborn as sns
from sklearn.datasets import make_blobs
%matplotlib notebook

warnings.filterwarnings('ignore')

In [None]:
%matplotlib notebook

Set the parameters

In [None]:
SAMPLE = 1000                   # number of samples drawn from each cluster
mu_arr = [0, 1., 5.]            # true centers (means) of the clusters
num_components = len(mu_arr)    # number of mixture components -> K (derived, so it cannot drift from mu_arr)
SEED = 0                        # fixed seed: data and CAVI initialization are reproducible on Restart & Run All
np.random.seed(SEED)

Draw the data

In [None]:
# Draw SAMPLE points from each unit-variance Gaussian N(mu, 1) and stack the
# draws cluster by cluster, so X[k*SAMPLE:(k+1)*SAMPLE] holds the data generated
# around mu_arr[k]. The clusters are sampled in the order of mu_arr.
X = np.concatenate([
    np.random.normal(loc=mu, scale=1, size=SAMPLE)   # args: scale=std
    for mu in mu_arr
])

Plot the data

In [None]:
# Histogram + KDE + rug plot of the raw draws, one series per true cluster.
fig, ax = plt.subplots(figsize=(10, 4))
for k, mu in enumerate(mu_arr):
    # X is stacked cluster by cluster, SAMPLE points each.
    sns.distplot(X[k*SAMPLE:(k+1)*SAMPLE], ax=ax, rug=True, label=f'cluster {k} (mean {mu})')
ax.set(title='Simulated data', xlabel='x', ylabel='density')
ax.legend();

### Derive the equations for the ELBO.

In [None]:
# TODO: derive the ELBO here (we do this together in the tutorial).

### Implement CAVI updates for GMM

In [None]:
import numpy as np

class UGMM(object):
    '''Univariate Gaussian mixture model fitted with CAVI.

    Generative model (unit-variance likelihood):
        mu_k ~ N(0, sigma^2)                      k = 1..K
        c_i  ~ Categorical(1/K, ..., 1/K)         i = 1..N
        x_i | c_i, mu ~ N(mu_{c_i}, 1)

    Mean-field variational family:
        q(mu_k) = N(m_k, s2_k)
        q(c_i)  = Categorical(rho_i)

    Attributes available after ``fit``:
        rho          (N, K) responsibilities, every row sums to 1
        m, s2        (K,) variational means / variances of the component means
        elbo_values  ELBO at initialization and after every CAVI sweep
        m_history    copies of ``m`` at initialization and after every sweep
        s2_history   copies of ``s2`` at initialization and after every sweep
    '''
    def __init__(self, X, K=2, sigma=1):
        '''Store the data and hyper-parameters (no fitting happens here).

        Parameters
        ----------
        X : array-like, shape (N,)
            Observations.
        K : int
            Number of mixture components.
        sigma : float
            Standard deviation of the zero-mean Gaussian prior on the
            component means.

        Raises
        ------
        ValueError
            If X is empty or not one-dimensional, K < 1, or sigma == 0.
        '''
        self.X = np.asarray(X, dtype=float)   # data
        if self.X.ndim != 1 or self.X.size == 0:
            raise ValueError('X must be a non-empty one-dimensional array')
        if K < 1:
            raise ValueError('K must be at least 1')
        if sigma == 0:
            raise ValueError('sigma must be non-zero')
        self.K = K                 # number of clusters
        self.N = self.X.shape[0]   # number of data
        self.sigma2 = sigma**2     # prior variance of the component means

    def _init(self):
        '''Randomly initialize the variational parameters rho, m and s2.'''
        # INITIALIZATION
        # Symmetric Dirichlet with a random concentration for the responsibilities.
        alpha = np.random.random()*np.random.randint(1, 10)
        self.rho = np.random.dirichlet([alpha]*self.K, self.N)
        # Integer offset inside the data range; ``high`` is bumped when the data
        # span less than one integer, otherwise randint raises (low >= high).
        low = int(self.X.min())
        high = max(int(self.X.max()), low + 1)
        self.m = np.random.randint(low, high=high, size=self.K).astype(float)
        self.m += self.X.max()*np.random.random(self.K)
        self.s2 = np.ones(self.K) * np.random.random(self.K)
        print('Init mean')
        print(self.m)
        print('Init s2')
        print(self.s2)

    def get_elbo(self):
        '''Return the evidence lower bound for the current rho, m and s2.

        ELBO = E_q[log p(mu)] + E_q[log p(c)] + E_q[log p(x | c, mu)]
               + H[q(c)] + H[q(mu)], with all additive constants included.
        '''
        second_moment = self.m**2 + self.s2    # E_q[mu_k^2]

        # E_q[log p(mu)]: zero-mean Gaussian prior with variance sigma2.
        log_prior_mu = np.sum(-0.5*np.log(2*np.pi*self.sigma2)
                              - second_moment/(2*self.sigma2))
        # E_q[log p(c)]: uniform prior over the K components.
        log_prior_c = -np.log(self.K)*self.rho.sum()
        # E_q[(x_i - mu_k)^2] = x_i^2 - 2 x_i m_k + E_q[mu_k^2]
        sq_err = (self.X[:, np.newaxis]**2
                  - 2*np.outer(self.X, self.m)
                  + second_moment[np.newaxis, :])
        log_lik = np.sum(self.rho*(-0.5*np.log(2*np.pi) - 0.5*sq_err))
        # Entropies; clipping makes 0*log(0) evaluate to 0 instead of nan.
        safe_rho = np.clip(self.rho, np.finfo(float).tiny, None)
        entropy_c = -np.sum(self.rho*np.log(safe_rho))
        entropy_mu = np.sum(0.5*np.log(2*np.pi*np.e*self.s2))

        elbo = log_prior_mu + log_prior_c + log_lik + entropy_c + entropy_mu

        return elbo

    def fit(self, max_iter=100, tol=1e-10):
        '''Run CAVI until the ELBO changes by at most ``tol`` or for ``max_iter`` sweeps.

        Parameters
        ----------
        max_iter : int
            Maximum number of CAVI sweeps.
        tol : float
            Absolute ELBO change below which the run is declared converged.
        '''
        # INIT
        self._init()
        self.elbo_values = [self.get_elbo()]
        # Copies, so the histories never alias the live parameter arrays.
        self.m_history = [self.m.copy()]
        self.s2_history = [self.s2.copy()]
        for iter_ in range(1, max_iter+1):
            # STEP
            self._cavi()
            self.m_history.append(self.m.copy())
            self.s2_history.append(self.s2.copy())
            # EVALUATE ELBO
            self.elbo_values.append(self.get_elbo())
            if iter_ % 5 == 0:
                print(iter_, self.m_history[iter_])
            if np.abs(self.elbo_values[-2] - self.elbo_values[-1]) <= tol:
                print('ELBO converged with ll %.3f at iteration %d'%(self.elbo_values[-1],
                                                                     iter_))
                break
        else:
            # Reached only when the loop was not left through ``break``; this
            # also covers max_iter == 0, where no sweep is run at all.
            print('ELBO ended with ll %.3f'%(self.elbo_values[-1]))

    def _cavi(self):
        '''One CAVI sweep: update q(c), then q(mu).'''
        self._update_rho()
        self._update_mu()

    def _update_rho(self):
        '''Update the responsibilities:
        rho_ik is proportional to exp(x_i m_k - (m_k^2 + s2_k)/2).'''
        logits = np.outer(self.X, self.m) - 0.5*(self.m**2 + self.s2)
        # Subtract the row-wise maximum before exponentiating to avoid overflow;
        # the shift cancels in the normalization.
        logits -= logits.max(axis=1, keepdims=True)
        weights = np.exp(logits)
        self.rho = weights / weights.sum(axis=1, keepdims=True)

    def _update_mu(self):
        '''Update q(mu_k) = N(m_k, s2_k):
        s2_k = 1 / (1/sigma2 + sum_i rho_ik),  m_k = s2_k * sum_i rho_ik x_i.'''
        precision = 1.0/self.sigma2 + self.rho.sum(axis=0)
        self.m = (self.rho*self.X[:, np.newaxis]).sum(axis=0) / precision
        self.s2 = 1.0 / precision

### Fit the model to the data

In [None]:
# Fit with K tied to the simulation setting instead of a literal 3.
ugmm = UGMM(X, K=num_components)
ugmm.fit()

### Plot results at convergence

In [None]:
# For every true cluster: histogram/KDE of its data, and (dashed) the KDE of
# samples drawn from one fitted unit-variance component N(m_k, 1).
# Component k of the fit is not matched to true cluster k, so the colours only
# distinguish the curves.
fig, ax = plt.subplots(figsize=(10, 4))
for k, color in enumerate(['b', 'r', 'g']):
    sns.distplot(X[k*SAMPLE:(k+1)*SAMPLE], ax=ax, hist=True, norm_hist=True)
    sns.distplot(np.random.normal(ugmm.m[k], 1, SAMPLE), ax=ax, color=color,
                 hist=False, kde=True, kde_kws={'linestyle':'--'})
ax.set(title='CAVI convergence')
os.makedirs('figures', exist_ok=True)   # savefig does not create missing directories
plt.savefig('figures/L12_GMMexample_itConv.pdf')

### Plot ELBO  
Detect the change points in the ELBO curve.

In [None]:
fs = 20        # font size of the axis labels
x_max = 40     # last iteration shown on the x axis

elbo = np.asarray(ugmm.elbo_values, dtype=float)
last_it = len(elbo) - 1
# Iterations inspected as change points in the following cells. They are
# clamped to the last available iteration so that a run which converged early
# cannot trigger an IndexError.
it1 = min(10, last_it)
it2 = min(25, last_it)

fig, ax = plt.subplots()
ax.scatter(np.arange(len(elbo)), elbo)
for marked_it in (it1, it2):
    ax.scatter(marked_it, elbo[marked_it], marker='s', facecolors='none',
               edgecolors='r', s=200, linewidth=3)
ax.set_xlim(0, x_max)
# Derive the y range from the displayed iterations instead of hard-coding it.
# Iteration 0 (random initialization) is left out of the range because its ELBO
# is typically far below the rest and would flatten the curve.
shown = elbo[1:x_max + 1] if len(elbo) > 1 else elbo
shown = shown[np.isfinite(shown)]
if shown.size:
    pad = 0.05*(shown.max() - shown.min())
    if pad == 0:
        pad = 1.0
    ax.set_ylim(shown.min() - pad, shown.max() + pad)
ax.set_xlabel('Iterations', fontsize=fs)
ax.set_ylabel('ELBO', fontsize=fs)
os.makedirs('figures', exist_ok=True)   # savefig does not create missing directories
plt.savefig('figures/L12_GMMexample_ELBO.pdf')

### Plot the results at the beginning and at the change points.

Beginning

In [None]:
# State of the fit right after the random initialization (iteration 0).
it = 0
fig, ax = plt.subplots(figsize=(10, 4))
for k, color in enumerate(['b', 'r', 'g']):
    # Data of true cluster k, and (dashed) samples from component k at iteration `it`.
    sns.distplot(X[k*SAMPLE:(k+1)*SAMPLE], ax=ax, hist=True, norm_hist=True)
    sns.distplot(np.random.normal(ugmm.m_history[it][k], 1, SAMPLE), ax=ax, color=color,
                 hist=False, kde=True, kde_kws={'linestyle':'--'})
ax.set(title=f'Iteration {it}')
os.makedirs('figures', exist_ok=True)   # savefig does not create missing directories
plt.savefig('figures/L12_GMMexample_it0.pdf')

Change points (two plots)

In [None]:
# State of the fit at the first change point. The index is clamped to the last
# recorded iteration in case CAVI converged before it1.
it = min(it1, len(ugmm.m_history) - 1)
fig, ax = plt.subplots(figsize=(10, 4))
for k, color in enumerate(['b', 'r', 'g']):
    # Data of true cluster k, and (dashed) samples from component k at iteration `it`.
    sns.distplot(X[k*SAMPLE:(k+1)*SAMPLE], ax=ax, hist=True, norm_hist=True)
    sns.distplot(np.random.normal(ugmm.m_history[it][k], 1, SAMPLE), ax=ax, color=color,
                 hist=False, kde=True, kde_kws={'linestyle':'--'})
ax.set(title=f'Iteration {it}')   # title reports the iteration actually plotted
os.makedirs('figures', exist_ok=True)   # savefig does not create missing directories
plt.savefig('figures/L12_GMMexample_it1.pdf')

In [None]:
# State of the fit at the second change point. The index is clamped to the last
# recorded iteration in case CAVI converged before it2.
it = min(it2, len(ugmm.m_history) - 1)
fig, ax = plt.subplots(figsize=(10, 4))
for k, color in enumerate(['b', 'r', 'g']):
    # Data of true cluster k, and (dashed) samples from component k at iteration `it`.
    sns.distplot(X[k*SAMPLE:(k+1)*SAMPLE], ax=ax, hist=True, norm_hist=True)
    sns.distplot(np.random.normal(ugmm.m_history[it][k], 1, SAMPLE), ax=ax, color=color,
                 hist=False, kde=True, kde_kws={'linestyle':'--'})
ax.set(title=f'Iteration {it}')   # title reports the iteration actually plotted
os.makedirs('figures', exist_ok=True)   # savefig does not create missing directories
plt.savefig('figures/L12_GMMexample_it2.pdf')

### TO DISCUSS: Calculate accuracy

In [None]:
# Hard label of every data point: the component with the largest responsibility.
var_assignments = np.argmax(ugmm.rho, axis=1)

# Greedy matching of variational components to ground-truth clusters: visit the
# components in index order and give each one the still-unmatched ground-truth
# cluster whose block of SAMPLE points contains the most points labelled with it.
var_to_gt = {}
for k in range(num_components):
    free_gt = set(range(num_components)).difference(set(var_to_gt.values()))
    votes = {
        gt: (var_assignments[gt * SAMPLE : (gt + 1) * SAMPLE] == k).sum()
        for gt in free_gt
    }
    gt_k = max(free_gt, key=votes.get)
    var_to_gt[k] = gt_k

In [None]:
# Translate component indices into ground-truth labels. The result gets its own
# name instead of overwriting var_assignments, so re-running this cell does not
# apply the mapping a second time.
component_to_gt = np.array([var_to_gt[k] for k in range(num_components)])
gt_assignments = component_to_gt[var_assignments]
# X is stacked cluster by cluster, so the true labels are SAMPLE zeros, SAMPLE ones, ...
true_labels = np.repeat(range(num_components), SAMPLE)
acc = (gt_assignments == true_labels).sum() / X.shape[0]
print('Accuracy attained:', acc)