# Bayes - A Nonparametric Bayesian Approach to Modeling Overlapping Clusters

In [None]:
%matplotlib inline

In [2]:
from matplotlib.pyplot import figure, show
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal 
from scipy.stats import beta

## Infinite Overlapping Mixture Model with Gaussian clusters

In [None]:
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=0)
df = pd.DataFrame(X, columns=['X1', 'X2'])
df['y']=y
df.head()

In [None]:
#Initialiser le centre des clusters (Facultatif)
clr = KMeans(n_clusters=3)
clr.fit(X,y)
clr.score(X)
clr.cluster_centers_[1].reshape(2,1)

In [None]:
ax = df[y==1].plot(x="X1", y="X2", kind="scatter", label="1", color='r', figsize=(10,10))
df[y==0].plot(x="X1", y="X2", kind="scatter", label="0",color='green', ax=ax)
df[y==2].plot(x="X1", y="X2", kind="scatter", label="2",color='grey', ax=ax)

In [None]:
#Je recupere les "overlapping clusters" avec les kNearestNeighbor
#Si la probailité d'être dans le cluster k est d'au moins 0.1 (à tuner) alors l'observation appartient au cluster k
knn =  KNeighborsClassifier(n_neighbors=30)
knn.fit(X,y)
pred = knn.predict_proba(X)
pred

In [None]:
#Load movie data


In [None]:
#Real clusters matrix
def possible_clusters(X):
    I=X.shape[0]
    J=X.shape[1]
    result = np.zeros(X.shape)
    for i in range(I):
        for j in range(J):
            if X[i,j]>0.1:
                result[i,j]=1
    return result
    
#Z = pd.get_dummies(y).as_matrix()
Z=possible_clusters(pred)

Z

In [None]:
#Create binary data to feed X
N=100
X=np.random.randint(2,size=N*2).reshape(N,2)

## Figure 3 algorithm
###  1. Initialize $\Theta$

In [None]:
#GAUSSIAN CLUSTERS
#mu=multivariate_normal.rvs(mean=np.zeros(2),cov=np.matrix([[1, 0], [0, 1]]),size=3)
#sigma=invwishart.rvs(df=4,size=3,scale=np.matrix([[1, 0], [0, 1]]))

In [None]:
#MULTIVARIATE BERNOULLI
#Initialize Theta matrix K=3 rows, D=2 columns
#D=2
#N=1000
#K=3 fixed for now
theta=np.empty([3,2])
for i in range(0,3):
    for j in range(0,2):
        u=np.random.uniform()
        theta[i,j]=u

In [None]:
theta

###  2. Initialize other elements

In [None]:
NumIters = 1
N = df.shape[0]
Z_hat = np.zeros((N,3)) #Matrix of clusters [observations]*[# clusters - takes 1 if belongs to cluster]
PZ_hat = np.zeros((N,3)) #Matrix of cluster probabilities
K=3

In [None]:
#On prend les n premieres observations et on leur donne les bons clusters associés pour entrainer le modele
n=100
for i in range(n):
    Z_hat[i,]=Z[i,]
Z_hat

###  3. Run algorithm

In [None]:
def m_without_i_k(Z, i, k):
    result=0
    for j in range(Z_hat.shape[0]):
        if j!=i:
            result+= Z_hat[j,k]
    return result

def likelihood_bern(X,Z,theta,i,k):  #not normalized
    temp=0
    lh=0
    for d in range(0,2):
        temp=temp+Z[i,k]*X[i,d]*np.log(theta[k,d]/(1-theta[k,d]))
    lh=np.exp(temp)
    return lh


for j in range(NumIters):
    for i in range(0,N):
        k_plus = [] #k+ is the number of clusters which data points, excluding i, belong to
        for k_ in range(K):
            if Z_hat[i,k_] == 0:
                k_plus.append(k_)  #for each obs, if proba to belong to cluster k_ is null, add it to k_plus
                                   #for data not in the training set, k_plus will take all possible values in 0...K
        print("Z initial:",Z[i,])
        print("k+=",k_plus)
        for k in k_plus:
            if Z_hat[i,k] == 0: #exclude data in the training set for which we already have the true categories
                #z_ik ⇠ zik|z−i,k, xi,theta
                theta_zi=np.zeros(6).reshape([3,2])
                Z_hat[i,k] = 1 #Set Z_hat to one (proposal)
                #Compute theta's of the Bernoulli likelihood function -- EQUATION (7) --
                for d in range(2):
                    num_temp=1
                    den_temp1=1
                    den_temp2=1
                    for k_ in range(K):
                        num_temp=num_temp*(theta[k_,d]**Z[i,k_])
                        den_temp1=den_temp1*((1-theta[k_,d])**Z[i,k_])
                        den_temp2=den_temp2*(theta[k_,d]**Z[i,k_])
                    theta_zi[:,d]=num_temp/(den_temp1+den_temp2)
                #compute bernouilli likelihood (not normalized)
                lh_bern=likelihood_bern(X,Z_hat,theta_zi,i,k)
                #compute matrix of probas of Z
                PZ_hat[i,k] = (m_without_i_k(Z_hat, i, k)/N)*lh_bern
                Z_hat[i,k] = 0  #reset Z_hat to zero
        #Propose adding new clusters 
        #Accept or reject proposal
        print("Proba Z:",PZ_hat[i,])
        print("Z_hat new:",Z_hat[i,])
        for k in k_plus:
            if Z_hat[i,k] == 0:
                u = np.random.uniform(0,1,1)  #PZ_hat is not normalized to [0,1]
                if u[0]<PZ_hat[i,k]:
                    Z_hat[i,k]=1
                    print(i,k)
        print("----------------")
    """
    #Resample theta|Z,X using MH proposal
    prob_A=0
    omega=0.5
    for k in range(K):
        for d in range(0,2):
            #generate proposal theta'(mu' and sigma') based on Beta(omega*theta,omega*(1-theta))
            T_prop=beta.rvs(omega*theta[k,d],omega*(1-theta[k,d]))
            T_mh=beta.rvs(omega*T_prop[k,d],omega*(1-T_prop[k,d]))
            #likelihood of x_d | ...
            lh_mh,lh_prop=np.zeros([2,3])
            for k_ in range(K):
                lh_mh[k_]=likelihood_bern(X,Z_hat,T_mh,i,k_)
                lh_prop[k_]=likelihood_bern(X,Z_hat,T_prop,i,k_)
            #priors beta
            xxx
    """

In [None]:
beta.rvs(omega*theta[k,],omega*(1-theta[k,]))

#### Get movies data from MovieLens

The clusters matrix is Z
The binary matrix is X

In [3]:
Z = pd.read_csv('clusters_matrix.csv', sep=',', index_col=0)
X = pd.read_csv('binary_data_matrix.csv', sep=',', index_col=0)
X.columns = X.columns.astype(int)
Z.columns = Z.columns.astype(int)

#### Initialization of all parameters

Our data are a binary matrix : we can use a multivariate Bernouilli simulation

In [4]:
NumIters = 1
N = X.shape[0]
K = Z.shape[1]
D = X.shape[1]

Z_hat = pd.DataFrame(np.zeros((N,K)), index=Z.index)  #Matrix of clusters [observations]*[# clusters - takes 1 if belongs to cluster]
PZ_hat = pd.DataFrame(np.zeros((N,K)), index=Z.index) #Matrix of cluster probabilities
theta = np.empty([K,D])
np.random.seed(1234)

for i in range(K):
    for j in range(D):
        u=np.random.uniform()
        theta[i,j]=u
        
#On prend les n premieres observations et on leur donne les bons clusters associés pour entrainer le modele
n=100
for i in Z.index[0:n]:
    Z_hat.loc[i] = Z.loc[i].values    
Z_hat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
1357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1537,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
np.random.seed(1234)

def m_without_i_k(Z, i, k):
    result=0
    for j in Z.index:
        if j!=i:
            result+= Z.loc[j,k]
    return result

def likelihood_bern(X,Z,theta,i,k):  #not normalized
    temp=0
    lh=0
    for d in range(0,D):
        temp=temp+Z.loc[i,k]*X.loc[i,X.columns[d]]*np.log(theta[k,d]/(1-theta[k,d]))
    lh=np.exp(temp)
    return lh


for j in range(NumIters):
    for i in Z.index:
        k_plus = [] #k+ is the number of clusters which data points, excluding i, belong to
        for k_ in range(K):
            if Z_hat.loc[i,k_] == 0:
                k_plus.append(k_)  #for each obs, if proba to belong to cluster k_ is null, add it to k_plus
                                   #for data not in the training set, k_plus will take all possible values in 0...K
        print("Z initial:", Z.loc[i,:])
        print("k+=", k_plus)
        
        for k in k_plus:
            if Z_hat.loc[i,k] == 0: #exclude data in the training set for which we already have the true categories
                #z_ik ⇠ zik|z−i,k, xi,theta
                theta_zi = np.zeros((K,D)) 
                Z_hat[i,k] = 1 #Set Z_hat to one (proposal)
                #Compute theta's of the Bernoulli likelihood function -- EQUATION (7) --
                for d in range(D):
                    num_temp=1
                    den_temp1=1
                    den_temp2=1
                    for k_ in range(K):
                        num_temp=num_temp*(theta[k_,d]**Z.loc[i,k_])
                        den_temp1=den_temp1*((1-theta[k_,d])**Z.loc[i,k_])
                        den_temp2=den_temp2*(theta[k_,d]**Z.loc[i,k_])
                    theta_zi[:,d]=num_temp/(den_temp1+den_temp2)
                #compute bernouilli likelihood (not normalized)
                lh_bern=likelihood_bern(X,Z_hat,theta_zi,i,k)
                #compute matrix of probas of Z
                PZ_hat[i,k] = (m_without_i_k(Z_hat, i, k)/N)*lh_bern
                Z_hat[i,k] = 0  #reset Z_hat to zero
        #Propose adding new clusters 
        #Accept or reject proposal
        print("Proba Z:",PZ_hat.loc[i,])
        print("Z_hat new:",Z_hat.loc[i,])
        for k in k_plus:
            if Z_hat.loc[i,k] == 0:
                u = np.random.uniform(0,1,1)  #PZ_hat is not normalized to [0,1]
                if u[0]<PZ_hat.loc[i,k]:
                    Z_hat.loc[i,k]=1
                    print(i,k)
        print("----------------")
    """
    #Resample theta|Z,X using MH proposal
    prob_A=0
    omega=0.5
    for k in range(K):
        for d in range(0,2):
            #generate proposal theta'(mu' and sigma') based on Beta(omega*theta,omega*(1-theta))
            T_prop=beta.rvs(omega*theta[k,d],omega*(1-theta[k,d]))
            T_mh=beta.rvs(omega*T_prop[k,d],omega*(1-T_prop[k,d]))
            #likelihood of x_d | ...
            lh_mh,lh_prop=np.zeros([2,3])
            for k_ in range(K):
                lh_mh[k_]=likelihood_bern(X,Z_hat,T_mh,i,k_)
                lh_prop[k_]=likelihood_bern(X,Z_hat,T_prop,i,k_)
            #priors beta
            xxx
    """

Z initial: 0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     1.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    1.0
14    0.0
15    0.0
16    0.0
17    0.0
Name: 1357, dtype: float64
k+= [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17]
Proba Z: 0             0.000000
1             0.000000
2             0.000000
3             0.000000
4             0.000000
5             0.000000
6             0.000000
7             0.000000
8             0.000000
9             0.000000
10            0.000000
11            0.000000
12            0.000000
13            0.000000
14            0.000000
15            0.000000
16            0.000000
17            0.000000
(1357, 0)     0.008811
(1357, 1)     0.001017
(1357, 2)     0.000339
(1357, 3)     0.000339
(1357, 4)     0.005761
(1357, 5)     0.003389
(1357, 6)     0.000339
(1357, 8)     0.000000
(1357, 9)     0.000339
(1357, 10)    0.001017
(1357, 11)    0.000000
(1357, 12)    0.000678
(1357, 14)    0.002033


In [None]:
U = np.dot(Z,Z.T)
print(U)
U_hat = np.dot(Z_hat,Z_hat.T)
print(U_hat)

### 4. $U$ and $\hat{U}$ comparison

In [None]:
fig = figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
ax1.title.set_text('U')
ax2.title.set_text('$\hat{U}$')

ax1.spy(U)
ax2.spy(U_hat)

show()