# Bayes - A Nonparametric Bayesian Approach to Modeling Overlapping Clusters

In [1]:
%matplotlib inline

In [62]:
from matplotlib.pyplot import figure, show
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import numpy.random as npr
from scipy.stats import multivariate_normal 
from scipy.stats import beta

## Infinite Overlapping Mixture Model with Binary clusters

### Synthetic Data

In [134]:
#Generate synthetic data
K=5
N=10
D=5
N_iter=5
Z=np.zeros([N,K])
X=np.zeros([N,D])
theta=np.zeros([K,D])
alpha_prior=1

#MCMC initialization
P_Z=np.zeros([N,K])
norm_lh=0
accept_proba=0

#randomize Z matrix
for i in range(N):
    ind1=int(npr.uniform(0,K))
    ind2=int(npr.uniform(0,K))
    ind3=int(npr.uniform(0,K))
    Z[i,ind1]=1
    if(npr.uniform(0,1)<0.5): #assume 50% chance of taking a second category
        Z[i,ind2]=1
    if(npr.uniform(0,1)<0.1): #assume 10% chance of taking a third category
        Z[i,ind3]=1

#randomize X matrix
for i in range(N):
    ind_dim=npr.uniform(0,1,D)
    for j in range(D):  #fill each row with randomly assign ones, let 1/3 proba of a given dimension to be = 1
        if ind_dim[j]<0.3:
            X[i,j]=1
    if np.sum(X[i,:])==0:  #at least one dimension equal one to avoid empty observations
        X[i,int(npr.uniform(0,D))]=1

#randomize theta based on beta(alpha/K,1) prior
for k in range(K):
    theta[k,:]=beta.rvs(alpha_prior/K,1,size=D)

theta_init=theta

In [135]:
Z

array([[ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  1.]])

In [136]:
X

array([[ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.],
       [ 1.,  1.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.,  1.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.]])

In [137]:
theta

array([[  5.22253109e-04,   7.07417778e-01,   6.64057698e-03,
          1.28879668e-12,   5.72501638e-01],
       [  1.78268284e-02,   3.98237183e-02,   8.01272490e-01,
          3.88285365e-02,   7.65531133e-01],
       [  3.21239716e-06,   3.65359542e-05,   2.27229963e-04,
          7.49050103e-02,   1.40901759e-02],
       [  4.11770225e-01,   3.11727195e-04,   5.40322075e-03,
          2.37458261e-01,   1.68528951e-06],
       [  1.16292129e-07,   1.54366758e-03,   1.30017679e-03,
          1.41616746e-02,   5.32521543e-02]])

In [138]:
def m_without_i_k(Z, i, k):
    result=0
    for j in range(N):
        if j!=i:
            result+= Z[j,k]
    return result

In [139]:
#LIKELIHOOD density of observation i, k fixed
def likelihood_ber(X,Z,theta,i,k):
    temp=0
    norm_temp=0
    lh=0
    for d in range(D):
        temp=temp+Z[i,k]*X[i,d]*np.log(theta[k,d]/(1-theta[k,d]))
    lh=np.exp(temp)
    return lh

In [140]:
#DEFINE FUNCTIONS FOR METROPOLIS ALGO ELEMENTS
def proposal_beta(theta):
    omega=0.5
    return (beta.rvs(omega*theta,omega*(1-theta)))

#transition probability
def trans_proba_beta(theta,theta_param,k,d):
    omega=0.5
    theta_param_value=theta_param[k]
    theta_value=theta[k]
    return (beta.pdf(theta_value,omega*theta_param_value,omega*(1-theta_param_value)))

#LIKELIHOOD OF K DIMENSIONAL ARRAY (for MHA algo)
def likelihood_ber_d(X,Z,theta_vect,i,d):
    temp=np.zeros(K)
    norm_temp=0
    lh=np.zeros(K)
    for k in range(K):
        temp[k]=temp[k]+Z[i,k]*X[i,d]*np.log(theta_vect[d]/(1-theta_vect[d]))
        lh[k]=np.exp(temp[k])
    return lh

#### MCMC Algorithm

In [141]:
#NORMALIZATION CONSTANT
norm_lh=0
for k in range(K):
    norm_lh=norm_lh+likelihood_ber(X,Z,theta,i,k)
print(norm_lh)

3.05926028541


In [142]:
for j in range(N_iter):
    print("iteration n°",j)
    for i in range(N):
        print("i=",i)
        print("___________1.compute probability of observation i taking category k_________")
        for k in range(K):
            if Z[i,k]==0:  #we care only about categories that are not yet considered for movie i
                print("k=",k)
                P_Z[i,k]=(m_without_i_k(Z,i,k)/N)*likelihood_ber(X,Z,theta,i,k)/norm_lh
            print("proba Z=1:",P_Z[i,k])
        print("_________2.propose adding new clusters________")
        for k in range(K):
            if Z[i,k]==0:
                print("propose")
                if npr.uniform(0,1)<P_Z[i,k]:
                    print('accept')
                    Z[i,k]=1
    print("_______3.resample theta|Z,X using MHA_______")
    for d in range(D):
        #extract current theta_d at index k
        theta_current=theta[:,d]
        print("current theta:",theta_current)
        #draw a proposal parameter centered around its current value
        theta_prop=proposal_beta(theta_current)
        print("theta_k_d proposal:",theta_prop)
        #joint prior BETA(alpha/K,1) density over current and proposed parameters
        prior_theta_current=beta.pdf(theta_current,alpha_prior/K,1)
        print("joint prior current theta:",prior_theta_current)
        prior_theta_prop=beta.pdf(theta_prop,alpha_prior/K,1)
        print("joint prior prop theta:",prior_theta_prop)
        #likelihood densities
        lh_theta_current=likelihood_ber_d(X,Z,theta_current,i,d)
        print("likelihood current theta:",lh_theta_current)
        lh_theta_prop=likelihood_ber_d(X,Z,theta_prop,i,d)
        print("likelihood current prop:",lh_theta_prop)
        for k in range(K):
            #transition probabilities theta|theta_prop and theta_prop|theta
            trans_theta_prop=trans_proba_beta(theta_current,theta_prop,k,d)
            trans_theta_current=trans_proba_beta(theta_prop,theta_current,k,d)
            #accept/reject probability
            accept_proba=(np.dot(lh_theta_prop,prior_theta_prop)
                          *trans_theta_current)/(np.dot(lh_theta_current,prior_theta_current)*trans_theta_prop)
            print("acceptance probability=min(1,",accept_proba,")")
            #if npr.uniform(0,1)<min(1,accept_proba):
                #theta[:,d]=theta_prop

iteration n° 0
i= 0
___________1.compute probability of observation i taking category k_________
k= 0
proba Z=1: 0.13075056147
k= 1
proba Z=1: 0.0653752807349
proba Z=1: 0.0
k= 3
proba Z=1: 0.0653752807349
k= 4
proba Z=1: 0.0980629211023
_________2.propose adding new clusters________
propose
propose
accept
propose
propose
i= 1
___________1.compute probability of observation i taking category k_________
proba Z=1: 0.0
k= 1
proba Z=1: 0.0980629211023
k= 2
proba Z=1: 0.0980629211023
k= 3
proba Z=1: 0.0653752807349
k= 4
proba Z=1: 0.0980629211023
_________2.propose adding new clusters________
propose
propose
propose
propose
i= 2
___________1.compute probability of observation i taking category k_________
proba Z=1: 0.0
k= 1
proba Z=1: 0.0980629211023
k= 2
proba Z=1: 0.0980629211023
k= 3
proba Z=1: 0.0653752807349
k= 4
proba Z=1: 0.0980629211023
_________2.propose adding new clusters________
propose
propose
propose
propose
i= 3
___________1.compute probability of observation i taking catego




theta_k_d proposal: [  0.00000000e+000   8.00808217e-001   1.51837991e-002   6.92111205e-001
   2.65080953e-112]
joint prior current theta: [  6.49953772e+08   2.68973304e+00   1.59008852e+00   6.31771702e-01
   6.02751509e+00]
joint prior prop theta: [             inf   2.38895065e-01   5.70066654e+00   2.68465999e-01
   3.65028839e+88]
likelihood current theta: [ 1.  1.  1.  1.  1.]
likelihood current prop: [ 1.  1.  1.  1.  1.]
acceptance probability=min(1, nan )
acceptance probability=min(1, inf )
acceptance probability=min(1, inf )
acceptance probability=min(1, inf )
acceptance probability=min(1, inf )
current theta: [  5.72501638e-01   7.65531133e-01   1.40901759e-02   1.68528951e-06
   5.32521543e-02]
theta_k_d proposal: [  9.89541391e-02   9.90146671e-01   5.51116056e-57   0.00000000e+00
   6.50821051e-02]
joint prior current theta: [  3.12470202e-01   2.47662190e-01   6.05197135e+00   8.31170999e+03
   2.08910491e+00]
joint prior prop theta: [  1.27257335e+00   2.01590643e-01

joint prior current theta: [  11.04753391    0.23878432  164.3832331    13.02890228   40.7219032 ]
joint prior prop theta: [  4.53560558e+31   4.76197376e+01              inf   8.41625535e+08
   1.16243801e+50]
likelihood current theta: [ 1.  1.  1.  1.  1.]
likelihood current prop: [ nan  nan  nan  nan  nan]
acceptance probability=min(1, nan )
acceptance probability=min(1, nan )
acceptance probability=min(1, nan )
acceptance probability=min(1, nan )
acceptance probability=min(1, nan )
current theta: [  1.28879668e-12   3.88285365e-02   7.49050103e-02   2.37458261e-01
   1.41616746e-02]
theta_k_d proposal: [  0.00000000e+00   4.60272011e-19   6.90201815e-01   3.02612090e-14
   1.25213478e-03]
joint prior current theta: [  6.49953772e+08   2.68973304e+00   1.59008852e+00   6.31771702e-01
   6.02751509e+00]
joint prior prop theta: [             inf   9.34584002e+13   2.69059988e-01   1.30713660e+10
   4.19671167e+01]
likelihood current theta: [ 1.  1.  1.  1.  1.]
likelihood current prop

  return np.exp(self._logpdf(x, a, b))
