# Bayes - A Nonparametric Bayesian Approach to Modeling Overlapping Clusters

In [1]:
%matplotlib inline

In [62]:
from matplotlib.pyplot import figure, show
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import numpy.random as npr
from scipy.stats import multivariate_normal 
from scipy.stats import beta

## Infinite Overlapping Mixture Model with Binary clusters

### Synthetic Data

In [92]:
#Generate synthetic data
K=5
N=10
D=5
N_iter=5
Z=np.zeros([N,K])
X=np.zeros([N,D])
theta=np.zeros([K,D])
alpha_prior=1

#MCMC initialization
P_Z=np.zeros([N,K])
norm_lh=0
accept_proba=0

#randomize Z matrix
for i in range(N):
    ind1=int(npr.uniform(0,K))
    ind2=int(npr.uniform(0,K))
    ind3=int(npr.uniform(0,K))
    Z[i,ind1]=1
    if(npr.uniform(0,1)<0.5): #assume 50% chance of taking a second category
        Z[i,ind2]=1
    if(npr.uniform(0,1)<0.1): #assume 10% chance of taking a third category
        Z[i,ind3]=1

#randomize X matrix
for i in range(N):
    ind_dim=npr.uniform(0,1,D)
    for j in range(D):  #fill each row with randomly assign ones, let 1/3 proba of a given dimension to be = 1
        if ind_dim[j]<0.3:
            X[i,j]=1
    if np.sum(X[i,:])==0:  #at least one dimension equal one to avoid empty observations
        X[i,int(npr.uniform(0,D))]=1

#randomize theta based on beta(alpha/K,1) prior
for k in range(K):
    theta[k,:]=beta.rvs(alpha_prior/K,1,size=D)

theta_init=theta

In [93]:
Z

array([[ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.],
       [ 1.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])

In [94]:
X

array([[ 0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.]])

In [95]:
theta

array([[  2.12598980e-02,   2.18407059e-03,   2.93952243e-02,
          1.82334007e-03,   2.59852563e-06],
       [  1.04570517e-02,   3.70286004e-02,   1.46743074e-01,
          5.10389694e-02,   5.20635913e-02],
       [  1.04135321e-02,   8.23204584e-01,   8.60074107e-07,
          1.15220177e-01,   1.30472910e-02],
       [  9.54822305e-03,   1.16629391e-07,   2.50563839e-02,
          4.65074669e-02,   1.09664684e-01],
       [  1.60268516e-01,   4.60946426e-01,   5.49122632e-01,
          1.71280481e-01,   5.52071070e-03]])

In [96]:
def m_without_i_k(Z, i, k):
    result=0
    for j in range(N):
        if j!=i:
            result+= Z[j,k]
    return result

In [97]:
#LIKELIHOOD density of observation i, k fixed
def likelihood_ber(X,Z,theta,i,k):
    temp=0
    norm_temp=0
    lh=0
    for d in range(D):
        temp=temp+Z[i,k]*X[i,d]*np.log(theta[k,d]/(1-theta[k,d]))
    lh=np.exp(temp)
    return lh

In [98]:
#DEFINE FUNCTIONS FOR METROPOLIS ALGO ELEMENTS
def proposal_beta(theta,d):
    omega=0.5
    return (beta.rvs(omega*theta[:,d],omega*(1-theta[:,d])))

#transition probability
def trans_proba_beta(theta,theta_param,k,d):
    omega=0.5
    theta_param_value=theta_param[k]
    theta_value=theta[k]
    return (beta.pdf(theta_value,omega*theta_param_value,omega*(1-theta_param_value)))

#LIKELIHOOD OF K DIMENSIONAL ARRAY (for MHA algo)
def likelihood_ber_d(X,Z,theta_vect,i,d):
    temp=np.zeros(K)
    norm_temp=0
    lh=np.zeros(K)
    for k in range(K):
        temp[k]=temp[k]+Z[i,k]*X[i,d]*np.log(theta_vect[d]/(1-theta_vect[d]))
        lh[k]=np.exp(temp[k])
    return lh

#### MCMC Algorithm

In [99]:
#NORMALIZATION CONSTANT
norm_lh=0
for k in range(K):
    for i in range(N):
        norm_lh=norm_lh+likelihood_ber(X,Z,theta,i,k)
print(norm_lh)

36.7549540138


In [100]:
for j in range(N_iter):
    print("iteration n°",j)
    for i in range(N):
        print("i=",i)
        print("___________1.compute probability of observation i taking category k_________")
        for k in range(K):
            if Z[i,k]==0:  #we care only about categories that are not yet considered for movie i
                print("k=",k)
                P_Z[i,k]=(m_without_i_k(Z,i,k)/N)*likelihood_ber(X,Z,theta,i,k)/norm_lh
            print("proba Z=1:",P_Z[i,k])
        print("_________2.propose adding new clusters________")
        for k in range(K):
            if Z[i,k]==0:
                print("propose")
                if npr.uniform(0,1)<P_Z[i,k]:
                    print('accept')
                    Z[i,k]=1
    print("_______3.resample theta|Z,X using MHA_______")
    for d in range(D):
        #draw a proposal parameter centered around its current value
        theta_prop=proposal_beta(theta,d)
        #extract current theta_d at index k
        theta_current=theta[:,d]
        print("theta_k_d proposal:",theta_prop)
        #joint prior BETA(alpha/K,1) density over current and proposed parameters
        prior_theta_current=beta.pdf(theta_current,alpha_prior/K,1)
        print("joint prior current theta:",prior_theta_current)
        prior_theta_prop=beta.pdf(theta_prop,alpha_prior/K,1)
        print("joint prior prop theta:",prior_theta_prop)
        #likelihood densities
        lh_theta_current=likelihood_ber_d(X,Z,theta_current,i,d)
        print("likelihood current theta:",lh_theta_current)
        lh_theta_prop=likelihood_ber_d(X,Z,theta_prop,i,d)
        print("likelihood current prop:",lh_theta_prop)
        for k in range(K):
            #transition probabilities theta|theta_prop and theta_prop|theta
            trans_theta_prop=trans_proba_beta(theta_current,theta_prop,k,d)
            trans_theta_current=trans_proba_beta(theta_prop,theta_current,k,d)
            #accept/reject probability
            accept_proba=(np.dot(lh_theta_prop,prior_theta_prop)
                          *trans_theta_current)/(np.dot(lh_theta_current,prior_theta_current)*trans_theta_prop)
            print("acceptance probability=",min(1,accept_proba))
            #if npr.uniform(0,1)<min(1,accept_proba):
                #theta[:,d]=theta_prop

iteration n° 0
i= 0
___________1.compute probability of observation i taking category k_________
k= 0
proba Z=1: 0.00816216502101
k= 1
proba Z=1: 0.00544144334734
proba Z=1: 0.0
k= 3
proba Z=1: 0.00544144334734
k= 4
proba Z=1: 0.00816216502101
_________2.propose adding new clusters________
propose
propose
propose
propose
i= 1
___________1.compute probability of observation i taking category k_________
k= 0
proba Z=1: 0.00816216502101
k= 1
proba Z=1: 0.00544144334734
proba Z=1: 0.0
k= 3
proba Z=1: 0.00544144334734
k= 4
proba Z=1: 0.00816216502101
_________2.propose adding new clusters________
propose
propose
propose
propose
i= 2
___________1.compute probability of observation i taking category k_________
proba Z=1: 0.0
k= 1
proba Z=1: 0.00544144334734
proba Z=1: 0.0
proba Z=1: 0.0
k= 4
proba Z=1: 0.00816216502101
_________2.propose adding new clusters________
propose
propose
i= 3
___________1.compute probability of observation i taking category k_________
k= 0
proba Z=1: 0.0081621650210




proba Z=1: 0.00544144334734
proba Z=1: 0.0
k= 3
proba Z=1: 0.00544144334734
k= 4
proba Z=1: 0.00816216502101
_________2.propose adding new clusters________
propose
propose
propose
i= 8
___________1.compute probability of observation i taking category k_________
proba Z=1: 0.0
proba Z=1: 0.0
k= 2
proba Z=1: 0.0108828866947
k= 3
proba Z=1: 0.00544144334734
k= 4
proba Z=1: 0.00816216502101
_________2.propose adding new clusters________
propose
propose
propose
i= 9
___________1.compute probability of observation i taking category k_________
k= 0
proba Z=1: 0.00816216502101
k= 1
proba Z=1: 0.00544144334734
k= 2
proba Z=1: 0.0108828866947
k= 3
proba Z=1: 0.00544144334734
proba Z=1: 0.0
_________2.propose adding new clusters________
propose
propose
propose
propose
_______3.resample theta|Z,X using MHA_______
theta_k_d proposal: [  7.36428866e-005   7.79297824e-001   1.44580793e-295   2.46999220e-012
   2.64549845e-017]
joint prior current theta: [ 4.35492876  7.68250039  7.70817466  8.262128

i= 9
___________1.compute probability of observation i taking category k_________
k= 0
proba Z=1: 0.00816216502101
k= 1
proba Z=1: 0.00544144334734
k= 2
proba Z=1: 0.0108828866947
k= 3
proba Z=1: 0.00544144334734
proba Z=1: 0.0
_________2.propose adding new clusters________
propose
propose
propose
propose
_______3.resample theta|Z,X using MHA_______
theta_k_d proposal: [  1.40914321e-013   4.97262133e-160   4.10717792e-026   2.15454918e-102
   2.07754371e-001]
joint prior current theta: [ 4.35492876  7.68250039  7.70817466  8.26212861  0.86526955]
joint prior prop theta: [  3.81826345e+009   5.54321450e+126   4.07563405e+019   4.30868620e+080
   7.03055903e-001]
likelihood current theta: [ 1.         1.         1.         1.         0.0217217]
likelihood current prop: [  1.00000000e+00   1.00000000e+00   1.00000000e+00   1.00000000e+00
   1.40914321e-13]
acceptance probability= 1
acceptance probability= 1
acceptance probability= 1
acceptance probability= 1
acceptance probability= 1
the