# Bayes - A Nonparametric Bayesian Approach to Modeling Overlapping Clusters

In [1]:
%matplotlib inline

In [62]:
from matplotlib.pyplot import figure, show
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import numpy.random as npr
from scipy.stats import multivariate_normal 
from scipy.stats import beta

## Infinite Overlapping Mixture Model with Binary clusters

### Synthetic Data

In [239]:
#Generate synthetic data
K=5
N=10
D=5
N_iter=20
Z=np.zeros([N,K])
X=np.zeros([N,D])
theta=np.zeros([K,D])
alpha_prior=5

#MCMC initialization
P_Z=np.zeros([N,K])
norm_lh=0
accept_proba=0

#randomize Z matrix
for i in range(N):
    ind1=int(npr.uniform(0,K))
    ind2=int(npr.uniform(0,K))
    ind3=int(npr.uniform(0,K))
    Z[i,ind1]=1
    if(npr.uniform(0,1)<0.5): #assume 50% chance of taking a second category
        Z[i,ind2]=1
    if(npr.uniform(0,1)<0.3): #assume 30% chance of taking a third category
        Z[i,ind3]=1

#randomize X matrix
for i in range(N):
    ind_dim=npr.uniform(0,1,D)
    for j in range(D):  #fill each row with randomly assign ones, let 2/3 proba of a given dimension to be = 1
        if ind_dim[j]<0.66:
            X[i,j]=1
    if np.sum(X[i,:])==0:  #at least one dimension equal one to avoid empty observations
        X[i,int(npr.uniform(0,D))]=1

#randomize theta based on beta(alpha/K,1) prior
for k in range(K):
    theta[k,:]=beta.rvs(alpha_prior/K,1,size=D)

theta_init=theta

In [240]:
Z

array([[ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.]])

In [241]:
X

array([[ 0.,  1.,  0.,  1.,  0.],
       [ 0.,  1.,  1.,  1.,  1.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  1.,  1.,  0.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  0.]])

In [242]:
theta

array([[  7.98370927e-01,   5.29214537e-01,   9.34274318e-01,
          2.23345647e-03,   5.51346827e-01],
       [  8.80578689e-01,   1.86183953e-01,   7.78201299e-01,
          9.65584014e-01,   4.06134714e-01],
       [  9.03423666e-01,   9.07834787e-01,   5.55311126e-01,
          6.03612812e-01,   2.40907650e-01],
       [  1.28597849e-01,   4.93286357e-01,   2.62421728e-01,
          2.09426033e-01,   9.04476047e-01],
       [  1.18281011e-01,   5.29468412e-04,   7.28584961e-01,
          9.69094483e-02,   4.77356838e-01]])

In [243]:
def m_without_i_k(Z, i, k):
    result=0
    for j in range(N):
        if j!=i:
            result+= Z[j,k]
    return result

In [244]:
#LIKELIHOOD density of observation i, k fixed
def likelihood_ber(X,Z,theta,i,k):
    temp=0
    norm_temp=0
    lh=0
    for d in range(D):
        temp=temp+Z[i,k]*X[i,d]*np.log(theta[k,d]/(1-theta[k,d]))
    lh=np.exp(temp)
    return lh

In [245]:
#DEFINE FUNCTIONS FOR METROPOLIS ALGO ELEMENTS
def proposal_beta(theta):
    omega=10
    return (beta.rvs(omega*theta,omega*(1-theta)))

#transition probability
def trans_proba_beta(theta,theta_param,k,d):
    omega=0.5
    theta_param_value=theta_param[k]
    theta_value=theta[k]
    return (beta.pdf(theta_value,omega*theta_param_value,omega*(1-theta_param_value)))

#LIKELIHOOD OF K DIMENSIONAL ARRAY (for MHA algo)
def likelihood_ber_d(X,Z,theta_vect,i,d):
    temp=np.zeros(K)
    norm_temp=0
    lh=np.zeros(K)
    for k in range(K):
        temp[k]=temp[k]+Z[i,k]*X[i,d]*np.log(theta_vect[d]/(1-theta_vect[d]))
        lh[k]=np.exp(temp[k])
    return lh

#### MCMC Algorithm

In [246]:
#NORMALIZATION CONSTANT
norm_lh=0
for k in range(K):
    norm_lh=norm_lh+likelihood_ber(X,Z,theta,i,k)
print(norm_lh)

170.06003805


In [247]:
for j in range(N_iter):
    print("iteration n°",j)
    for i in range(N):
        print("i=",i)
        print("___________1.compute probability of observation i taking category k_________")
        for k in range(K):
            if (m_without_i_k(Z,i,k)>0 and Z[i,k]==0):  #we care only about existing clusters
                print("k=",k)
                Z_cond=np.copy(Z) #to compute P[Z=1|...) likelihood must be conditional on Z[i,k]=1
                Z_cond[i,k]=1
                P_Z[i,k]=(m_without_i_k(Z,i,k)/N)*likelihood_ber(X,Z_cond,theta,i,k)/norm_lh
                #print("likelihood:",likelihood_ber(X,Z_cond,theta,i,k))
                print("proba Z=1:",P_Z[i,k])
        print("_________2.propose adding new categories to existing clusters________")
        for k in range(K):
            if Z[i,k]==0:
                print("propose")
                if npr.uniform(0,1)<P_Z[i,k]:
                    print('accept')
                    Z[i,k]=1
    print("_______3.resample theta|Z,X using MHA_______")
    for d in range(D):
        #extract current theta_d at index k
        theta_current=theta[:,d]
        print("current theta:",theta_current)
        #draw a proposal parameter centered around its current value
        theta_prop=proposal_beta(theta_current)
        print("theta_k_d proposal:",theta_prop)
        #joint prior BETA(alpha/K,1) density over current and proposed parameters
        prior_theta_current=beta.pdf(theta_current,alpha_prior/K,1)
        print("joint prior current theta:",prior_theta_current)
        prior_theta_prop=beta.pdf(theta_prop,alpha_prior/K,1)
        print("joint prior prop theta:",prior_theta_prop)
        #likelihood densities
        lh_theta_current=likelihood_ber_d(X,Z,theta_current,i,d)
        print("likelihood current theta:",lh_theta_current)
        lh_theta_prop=likelihood_ber_d(X,Z,theta_prop,i,d)
        print("likelihood current prop:",lh_theta_prop)
        for k in range(K):
            #transition probabilities theta|theta_prop and theta_prop|theta
            trans_theta_prop=trans_proba_beta(theta_current,theta_prop,k,d)
            trans_theta_current=trans_proba_beta(theta_prop,theta_current,k,d)
            #accept/reject probability
            accept_proba=(np.dot(lh_theta_prop,prior_theta_prop)
                          *trans_theta_current)/(np.dot(lh_theta_current,prior_theta_current)*trans_theta_prop)
            print("acceptance probability=min(1,",accept_proba,")")
            if npr.uniform(0,1)<min(1,accept_proba):
                theta[:,d]=theta_prop

iteration n° 0
i= 0
___________1.compute probability of observation i taking category k_________
k= 1
proba Z=1: 0.0113230865766
k= 2
proba Z=1: 0.0352806313579
k= 4
proba Z=1: 6.68548020587e-08
_________2.propose adding new categories to existing clusters________
propose
propose
propose
propose
i= 1
___________1.compute probability of observation i taking category k_________
k= 1
proba Z=1: 0.0271693941037
k= 4
proba Z=1: 1.63914274816e-07
_________2.propose adding new categories to existing clusters________
propose
propose
propose
i= 2
___________1.compute probability of observation i taking category k_________
k= 2
proba Z=1: 0.0352806313579
k= 3
proba Z=1: 0.000758215045363
k= 4
proba Z=1: 6.68548020587e-08
_________2.propose adding new categories to existing clusters________
propose
propose
propose
propose
i= 3
___________1.compute probability of observation i taking category k_________
k= 1
proba Z=1: 0.00203518309721
_________2.propose adding new categories to existing clusters_

  import sys
  import sys


ValueError: Domain error in arguments.