In [None]:
from easydict import EasyDict

import scipy.stats as ss

from multiinstance.data.realData import buildDataset
from multiinstance.utils import *
from multiinstance.gradientMethod import g1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [None]:
# Build the dataset from the wine data file. Every *Distr argument is a
# zero-argument callable; np.random.choice over a one-element list always
# returns that element, so nPDistr/nUDistr yield the constants 5 and 25, while
# alphaDistr draws uniformly from [0.05, 0.95).
# NOTE(review): size=2 presumably sets the number of bags (ds.N) -- confirm
# against buildDataset.
ds = buildDataset(
    "/home/dzeiberg/ClassPriorEstimation/rawDatasets/wine.mat",
    size=2,
    nPDistr=lambda: np.random.choice([5]),
    nUDistr=lambda: np.random.choice([25]),
    alphaDistr=lambda: np.random.uniform(0.05, .95),
)

# Enrichment pipeline, applied innermost-first:
# transform scores -> global estimates -> per-bag alpha hats.
ds = addBagAlphaHats(addGlobalEsts(addTransformScores(ds)))
# numU-weighted average of the true alphas.
ds.trueGlobalClassPrior = ds.trueAlphas.flatten().dot(ds.numU) / ds.numU.sum()

In [None]:
def init(ds):
    """Build the starting parameters for EM on a K-component 1-D Gaussian mixture.

    K is taken from ``ds.N``.

    Parameters
    ----------
    ds : dataset object exposing ``alphaHats`` (2-D array; its row-wise mean
        seeds the component means -- presumably one row per bag, confirm
        against the dataset builder) and ``N`` (number of components).

    Returns
    -------
    EasyDict with
        means        -- row-wise mean of ``ds.alphaHats``
        variances    -- ones, shape (K,)
        mixingCoefs  -- uniform prior 1/K, shape (K,)
    """
    K = ds.N
    d = EasyDict()
    d.means = ds.alphaHats.mean(1)
    # Unit variances are used as the starting point instead of the empirical
    # per-row variance (ds.alphaHats.var(1)).
    d.variances = np.ones(K)
    # One mixing coefficient per component, summing to 1. The previous version
    # was shaped like ds.globalAlphaHats and fixed at .5, which is a valid
    # prior only when K == 2.
    d.mixingCoefs = np.full(K, 1.0 / K)
    return d

In [None]:
# Synthetic sanity check: replace the global estimates with a bootstrap sample
# from two well-separated Gaussians, N(-2, sd=1) and N(3, sd=1.5), so the EM
# code below can be checked against a known answer.
a = np.random.normal(-2, 1, 100)
b = np.random.normal(3, 1.5, 100)
ds.globalAlphaHats = np.random.choice(np.hstack([a, b]), 200)

In [None]:
# Reference curves: the two densities the synthetic sample was drawn from,
# evaluated on [-10, 10) in steps of 0.01.
plt.plot(np.arange(-10, 10, .01), ss.norm(loc=-2, scale=1).pdf(np.arange(-10, 10, .01)))
plt.plot(np.arange(-10, 10, .01), ss.norm(loc=3, scale=1.5).pdf(np.arange(-10, 10, .01)))

In [None]:
# def E_Step(ds,params):
#     N = len(ds.globalAlphaHats)
#     K = ds.N
#     gamma = np.zeros((N,K))
#     for k in range(K):
#         mean_k,var_k = params.means[k], params.variances[k]
#         for n in range(N):
#             print("p({:.3f} | {:.3f},{:.3f})={:.3f}".format(ds.globalAlphaHats[n],mean_k,var_k,
#                                              ss.norm.pdf(ds.globalAlphaHats[n],loc=mean_k, scale=var_k)))
#             gamma[n,k] = params.mixingCoefs[k] * ss.norm.pdf(ds.globalAlphaHats[n],
#                                                              loc=mean_k,
#                                                              scale=var_k)
#     gamma = gamma / np.tile(gamma.sum(1).reshape((-1,1)),(1,gamma.shape[1]))
#     params.gamma = gamma
# #     assert False
#     return params

def E_Step(ds,params):
    """Expectation step of EM for a one-dimensional Gaussian mixture.

    For every observation x_n in ``ds.globalAlphaHats`` and every component k
    (``ds.N`` components) compute the responsibility

        gamma[n, k] = pi_k * N(x_n | mu_k, var_k) / sum_j pi_j * N(x_n | mu_j, var_j)

    Parameters
    ----------
    ds : object with ``globalAlphaHats`` (1-D sequence of observations) and
        ``N`` (number of mixture components).
    params : object with ``mixingCoefs``, ``means`` and ``variances``, each
        indexable by component; only the first ``ds.N`` entries are read.

    Returns
    -------
    The same ``params`` object, with ``params.gamma`` set to an
    (n_observations, K) array. Each row sums to 1 unless every component
    density underflows to 0 for that observation.
    """
    x = np.asarray(ds.globalAlphaHats, dtype=float)
    K = ds.N
    pi = np.asarray(params.mixingCoefs, dtype=float)[:K]
    mu = np.asarray(params.means, dtype=float)[:K]
    # ss.norm.pdf takes the standard deviation as `scale`; params.variances
    # stores variances (M_Step writes mean squared deviations into it), so the
    # square root is required here.
    sd = np.sqrt(np.asarray(params.variances, dtype=float)[:K])
    # Broadcast observations (rows) against components (columns).
    gamma = pi * ss.norm.pdf(x[:, np.newaxis], loc=mu, scale=sd)
    # Normalise per observation so responsibilities sum to 1 across components.
    gamma = gamma / gamma.sum(axis=1, keepdims=True)
    params.gamma = gamma
    return params

In [None]:
# def M_Step(ds,params):
#     N = len(ds.globalAlphaHats)
#     K = ds.N
#     Nk = params.gamma.sum(0)
#     for k in range(K):
#         params.means[k] = params.gamma[:,k].dot(ds.globalAlphaHats) / Nk[k]
#         for n in range(N):
#             params.variances[k] += params.gamma[n,k] * (ds.globalAlphaHats[n] - params.means[k])**2
#         params.variances[k] /= Nk[k]
#         params.mixingCoefs[k] = Nk[k] / N
#     return params

def M_Step(ds,params):
    """Maximisation step of EM for a one-dimensional Gaussian mixture.

    Using the responsibilities in ``params.gamma`` (shape (n_observations, K)),
    re-estimate for every component k:

        N_k   = sum_n gamma[n, k]
        mu_k  = sum_n gamma[n, k] * x_n / N_k
        var_k = sum_n gamma[n, k] * (x_n - mu_k)**2 / N_k
        pi_k  = N_k / n_observations

    Parameters
    ----------
    ds : object with ``globalAlphaHats`` (1-D sequence of observations) and
        ``N`` (number of mixture components, K).
    params : object with ``gamma`` plus mutable, component-indexable ``means``,
        ``variances`` and ``mixingCoefs``; these are updated in place.

    Returns
    -------
    The same ``params`` object.
    """
    x = np.asarray(ds.globalAlphaHats, dtype=float)
    K = ds.N
    N = len(x)
    Nk = params.gamma.sum(0)
    for k in range(K):
        resp = params.gamma[:, k]
        # Mu: the weighted sum is computed from scratch. Accumulating onto the
        # previous mean (as before) adds old_mean / N_k to every update.
        mu_k = resp.dot(x) / Nk[k]
        params.means[k] = mu_k
        # Sigma: likewise computed from scratch, around the freshly updated mean.
        params.variances[k] = resp.dot((x - mu_k) ** 2) / Nk[k]
        # Pi
        params.mixingCoefs[k] = Nk[k] / N
    return params

In [None]:
# Starting EM parameters (means, variances, mixing coefficients) derived from ds.
params = init(ds)

In [None]:
# Inspect the initial component means before running EM.
params.means

In [None]:
# Run 100 EM iterations, recording the component means and variances after
# every iteration. Row 0 of each history holds the initial values, so both
# histories end up with 101 rows and one column per component.
gammas = []
means = [np.asarray(params.means.tolist()).reshape(1, -1)]
variances = [np.asarray(params.variances.tolist()).reshape(1, -1)]
for i in range(100):
    # E-step fills params.gamma, M-step then re-estimates the parameters.
    params = M_Step(ds, E_Step(ds, params))
    # Snapshot copies: the steps update params in place.
    means += [np.asarray(params.means.tolist()).reshape(1, -1)]
    variances += [np.asarray(params.variances.tolist()).reshape(1, -1)]
means = np.vstack(means)
variances = np.vstack(variances)

In [None]:
# Trajectory of the component means: one line per component, x-axis is the EM
# iteration (index 0 = initial value).
plt.plot(means)

In [None]:
# Trajectory of the component variances: one line per component, x-axis is the
# EM iteration (index 0 = initial value).
plt.plot(variances)

# Take 2

In [None]:
# Rebuild the dataset for the second experiment. Every *Distr argument is a
# zero-argument callable; np.random.choice over a one-element list always
# returns that element, so nPDistr/nUDistr yield the constants 5 and 25, while
# alphaDistr draws uniformly from [0.05, 0.95).
ds = buildDataset(
    "/home/dzeiberg/ClassPriorEstimation/rawDatasets/wine.mat",
    size=2,
    nPDistr=lambda: np.random.choice([5]),
    nUDistr=lambda: np.random.choice([25]),
    alphaDistr=lambda: np.random.uniform(0.05, .95),
)

# Same enrichment pipeline as a nested chain (innermost runs first); both
# estimators are called with reps=100.
ds = addBagAlphaHats(
    addGlobalEsts(addTransformScores(ds), reps=100),
    reps=100,
)
# numU-weighted average of the true alphas.
ds.trueGlobalClassPrior = ds.trueAlphas.flatten().dot(ds.numU) / ds.numU.sum()

In [None]:
# define the number of points
n_samples = 100
# Reference components: centres are read from ds.trueAlphas and 0.1 is used as
# the variance (np.sqrt is applied below to get the standard deviation that
# np.random.normal expects).
mu1, sigma1 = ds.trueAlphas[0], .1 # mean and variance
mu2, sigma2 = ds.trueAlphas[1], .1 # mean and variance
# mu3, sigma3 = ds.trueAlphas[2], .1 # mean and variance

# Draw n_samples points from each reference component.
x1 = np.random.normal(mu1, np.sqrt(sigma1), n_samples)
x2 = np.random.normal(mu2, np.sqrt(sigma2), n_samples)
# x3 = np.random.normal(mu3, np.sqrt(sigma3), n_samples)

# NOTE(review): this synthetic sample is discarded -- X is reassigned on the
# very next line, so x1/x2 never reach the fit. Confirm which data source is
# intended.
X = np.array(list(x1) + list(x2))
# Data actually fitted below: 200 draws (np.random.choice samples with
# replacement by default) from the global alpha estimates.
X = np.random.choice(ds.globalAlphaHats,size=200)
# In-place shuffle of the sample.
np.random.shuffle(X)
print("Dataset shape:", X.shape)

In [None]:
def pdf(data, mean: float, variance: float):
    """Density of the univariate normal N(mean, variance) evaluated at ``data``.

    ``data`` may be a scalar or a numpy array; the result has the same shape.
    Note that the third argument is the variance, not the standard deviation.
    """
    deviation = data - mean
    exponent = -((deviation * deviation) / (2 * variance))
    normaliser = np.sqrt(2 * np.pi * variance)
    return (1 / normaliser) * np.exp(exponent)

In [None]:
# visualize the training data
# Evaluation grid on [0, 1]; reused by the per-iteration plots further down.
bins = np.linspace(0, 1, 100)

plt.figure(figsize=(10, 7))
# Rug of the observations just above the x-axis.
plt.scatter(X, np.full(len(X), 0.005), color='navy', s=30, marker=2, label="Train data")

# Reference densities; only the first carries a label so the legend shows a
# single "True pdf" entry.
plt.plot(bins, pdf(bins, mu1, sigma1), color='red', label="True pdf")
plt.plot(bins, pdf(bins, mu2, sigma2), color='red')
# plt.plot(bins, pdf(bins, mu3, sigma3), color='red')

plt.xlabel("$x$")
plt.ylabel("pdf")
plt.xlim(0, 1)
plt.legend()
plt.plot()

In [None]:
# Number of clusters to be learned, starting from uniform mixing weights.
k = 2
weights = np.full(k, 1 / k)
# Each mean starts at a randomly picked observation; each variance at a
# uniform draw from [0, 1).
means = np.random.choice(X, size=k)
variances = np.random.random_sample(k)
# print(means, variances)

In [None]:
# Make sure X is a numpy array (np.array makes a copy) and report its shape
# before running EM.
X = np.array(X)
print(X.shape)

In [None]:
# EM for a k-component 1-D Gaussian mixture on X, plotting the fit before every
# update. eps guards the divisions against a zero denominator.
eps=1e-8
for step in range(100):

    # Plot the current fit. `step % 1` is always 0, so every iteration is
    # drawn and saved; raise the modulus to plot less often.
    if step % 1 == 0:
        plt.figure(figsize=(10,6))
        axes = plt.gca()
        plt.xlabel("$x$")
        plt.ylabel("pdf")
        plt.title("Iteration {}".format(step))
        plt.scatter(X, [0.005] * len(X), color='navy', s=30, marker=2, label="Train data")

        plt.plot(bins, pdf(bins, mu1, sigma1), color='grey', label="True pdf")
        plt.plot(bins, pdf(bins, mu2, sigma2), color='grey')
#         plt.plot(bins, pdf(bins, mu3, sigma3), color='grey')

        plt.plot(bins, pdf(bins, means[0], variances[0]), color='blue', label="Cluster 1")
        plt.plot(bins, pdf(bins, means[1], variances[1]), color='green', label="Cluster 2")
#         plt.plot(bins, pdf(bins, means[2], variances[2]), color='magenta', label="Cluster 3")

        plt.legend(loc='upper left')

        plt.savefig("img_{0:02d}".format(step), bbox_inches='tight')
        plt.show()

    # Expectation step: likelihood of every observation under each component
    # (shape (k, len(X))) ...
    likelihood = np.array([pdf(X, means[j], variances[j]) for j in range(k)])

    # ... and the posterior probability (responsibility) of each component for
    # each observation. All responsibilities are computed BEFORE any parameter
    # is touched: computing b[j] inside the update loop would use weights that
    # were already overwritten for components i < j, so the responsibilities
    # would no longer sum to 1 across components.
    weighted = likelihood * np.asarray(weights)[:, np.newaxis]
    b = weighted / (weighted.sum(axis=0) + eps)

    # Maximization step
    for j in range(k):
        # eps is added once to the total responsibility (not once per element).
        resp_total = np.sum(b[j]) + eps

        # update mean and variance
        means[j] = np.sum(b[j] * X) / resp_total
        variances[j] = np.sum(b[j] * np.square(X - means[j])) / resp_total

        # update the weights
        weights[j] = np.mean(b[j])

In [None]:
# Display ds.trueAlphas (used above as the centres of the reference pdfs) for
# comparison with the fitted cluster means.
ds.trueAlphas