In [None]:
import autograd
from autograd import grad,jacobian,hessian
from autograd.scipy import stats as agss
import autograd.numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import scipy.stats as ss
import os
from glob import glob
from multiinstance.data.syntheticData import buildDataset
from multiinstance.utils import *

In [None]:
# export
def logLikelihood(xi,mu,sigma,normalize):
    """Gaussian log likelihood of the samples ``xi`` under N(mu, sigma**2).

    Parameters
    ----------
    xi : array-like
        Observed samples; ``len(xi)`` is used as the sample count.
    mu : float
        Mean of the normal distribution.
    sigma : float
        Standard deviation; 1e-8 is added to it before squaring.
    normalize : bool
        When truthy, the total is multiplied by ``1/len(xi)``.

    Returns
    -------
    float
        The (optionally per-sample) log likelihood.
    """
    n = len(xi)
    # Small offset keeps the variance strictly positive when sigma is 0.
    variance = (sigma + 1e-8)**2
    squaredError = np.sum((xi - mu)**2)
    total = -n/2 * np.log(2*np.pi*variance) - (1/(2*variance)) * squaredError
    if not normalize:
        return total
    return total * (1/n)

def getChildren(idx,N):
    """Collect the leaf-level node indices found beneath node ``idx``.

    The tree is heap ordered: node ``i`` has children ``2*i + 1`` and
    ``2*i + 2``. Any index greater than ``N - 1`` is treated as a leaf.

    Parameters
    ----------
    idx : int
        Node index to start from.
    N : int or float
        Threshold; nodes with index ``<= N - 1`` are expanded further.

    Returns
    -------
    numpy.ndarray
        Leaf node indices in left-to-right order (``[idx]`` when ``idx`` is
        itself a leaf).
    """
    found = []
    pending = [idx]
    while pending:
        node = pending.pop()
        if node > N - 1:
            found.append(node)
            continue
        left = 2 * node + 1
        # Push the right child first so the left subtree is popped first.
        pending.append(left + 1)
        pending.append(left)
    return np.array(found)

def treeNegativeLogLikelihood(x,leafN,normalize=True,rlambda=.5,lambdaTilde=.5):
    """Build the negative log-likelihood objective over a heap-ordered tree.

    Parameters
    ----------
    x : sequence
        ``x[idx]`` holds the samples attached to node ``idx``.
    leafN : array-like
        Weights indexed by leaf number (0 for the first leaf), used to form
        the weighted mean of the leaf means beneath each node.
    normalize : bool
        Forwarded to ``logLikelihood``.
    rlambda : float
        Each node's log likelihood is scaled by ``rlambda**row``, where
        ``row`` is the node's depth (root is row 0).
    lambdaTilde : float
        Weight of the squared difference between a node's mean and the
        weighted mean of the leaf means beneath it.

    Returns
    -------
    callable
        ``LL(bagMeans, bagSigma)``, both indexed by node, returning the
        negated penalised log likelihood.
    """
    def LL(bagMeans,bagSigma):
        NBags = len(bagSigma)
        # Integer count so the node arithmetic below stays integral.
        NInternal_Nodes = NBags // 2
        NLeaves = NBags - NInternal_Nodes
        ll = 0
        # NOTE(review): each row visits 2**row nodes, so this appears to
        # assume the number of leaves is a power of two.
        Nrows = int(np.ceil(np.log2(NLeaves))) + 1
        for row in range(Nrows):
            for col in range(2**row):
                idx = 2**row - 1 + col
                # Node indices of the leaves under idx, and their leaf numbers.
                leafNodes = getChildren(idx, NInternal_Nodes).astype(int)
                leafIndices = leafNodes - NInternal_Nodes
                ln = leafN[leafIndices]
                # bagMeans is indexed by node, so the leaf means must be read
                # at the leaf *node* indices, not at the leaf numbers.
                mu_tilde = np.dot(bagMeans[leafNodes],ln)/np.sum(ln)
                mu = bagMeans[idx]
                sigma = bagSigma[idx]
                # Plain square rather than norm(...)**2: the difference is a
                # scalar, and autograd's norm gradient is NaN at exactly zero,
                # which is what every leaf node produces here.
                penalty = (mu - mu_tilde)**2
                ll = ll + (rlambda**row) * logLikelihood(x[idx],
                                                         mu,
                                                         sigma,
                                                         normalize) - lambdaTilde * penalty
        return -1 * ll
    return LL

In [None]:
def prepDS(dsi,numbootstraps=100):
    """Attach per-node class-prior estimates and initial Gaussian fits to ``dsi``.

    Sets ``alphaHats``, ``curves``, ``numLeaves``, ``numNodes``,
    ``numInternal``, ``mu``, ``sigma``, ``leafN`` and ``treeAlphaHats`` on the
    dataset. Nodes are heap ordered with the leaves stored last.

    Parameters
    ----------
    dsi : dataset object
        Dataset to prepare; it is passed through ``addTransformScores`` and
        ``addGlobalEsts`` first.
    numbootstraps : int
        Forwarded to ``getBagAlphaHats``.

    Returns
    -------
    dataset object
        The same dataset with the attributes listed above filled in.
    """
    dsi = addTransformScores(dsi)
    dsi = addGlobalEsts(dsi)
    dsi.alphaHats,dsi.curves = getBagAlphaHats(dsi,numbootstraps=numbootstraps)

    dsi.numLeaves = dsi.alphaHats.shape[0]
    dsi.numNodes = dsi.numLeaves + (dsi.numLeaves - 1)
    dsi.numInternal = dsi.numNodes - dsi.numLeaves

    dsi.mu = np.zeros(dsi.numNodes)
    dsi.sigma = np.ones(dsi.numNodes)
    dsi.leafN = np.ones_like(dsi.mu) * dsi.alphaHats.shape[1]
    dsi.treeAlphaHats = [[] for _ in range(dsi.numNodes)]

    # The positive scores are pooled over every bag and do not depend on the
    # node, so gather them once instead of once per node.
    # Assumes getTransformScores is side-effect free and getEsts does not
    # modify its inputs — TODO confirm.
    pos = np.concatenate([getTransformScores(dsi,n)[0]
                          for n in range(dsi.N)]).reshape((-1,1))

    for nodeNum in range(dsi.numNodes):
        leafNums = getChildren(nodeNum, dsi.numInternal) - dsi.numInternal
        if nodeNum < dsi.numInternal:
            # Internal node: estimate from the unlabeled scores of all leaves below it.
            unlabeled = np.concatenate([getTransformScores(dsi,n)[1]
                                        for n in leafNums]).reshape((-1,1))
            NEstimates = int(np.sum([dsi.leafN[l] for l in leafNums]))
            dsi.treeAlphaHats[nodeNum],_ = getEsts(pos, unlabeled, NEstimates)
        else:
            # Leaf node: reuse the per-bag estimates computed above.
            dsi.treeAlphaHats[nodeNum] = dsi.alphaHats[leafNums[0]]
        dsi.mu[nodeNum], dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])
    return dsi

def runAlgorithm(dsi,normalize=True,NIter=1000,rlambda=.5,lambdaTilde=.5):
    """Run gradient descent on the tree negative log likelihood.

    ``dsi.mu`` and ``dsi.sigma`` are replaced on every iteration, so the
    dataset passed in is modified.

    Parameters
    ----------
    dsi : dataset object
        Must carry ``mu``, ``sigma``, ``treeAlphaHats``, ``leafN``,
        ``numLeaves`` and ``trueAlphas``.
    normalize, rlambda, lambdaTilde
        Forwarded to ``treeNegativeLogLikelihood``.
    NIter : int
        Number of gradient steps.

    Returns
    -------
    tuple
        ``(dsi, mus, sigmas, maes, logLikelihood)``. ``mus[k]``, ``sigmas[k]``
        and ``maes[k]`` describe the parameters after ``k`` steps (index 0 is
        the starting point).
    """
    def leafMAE():
        # Leaves occupy the last numLeaves entries of dsi.mu.
        return np.mean(np.abs(dsi.mu[-dsi.numLeaves:] - dsi.trueAlphas.flatten()))

    # Build the objective once, with the caller's hyperparameters, so the
    # tracked value is the same function the gradients are taken of.
    nllfunc = treeNegativeLogLikelihood(dsi.treeAlphaHats,
                                        dsi.leafN,
                                        normalize=normalize,
                                        rlambda=rlambda,lambdaTilde=lambdaTilde)
    gradNLL_mu = grad(nllfunc,0)
    gradNLL_sigma = grad(nllfunc,1)

    lr = .01
    maes = [leafMAE()]
    mus = [dsi.mu]
    sigmas = [dsi.sigma]
    negLogLikelihood = [nllfunc(dsi.mu,dsi.sigma)]
    for i in tqdm(range(NIter),total=NIter):
        # NOTE(review): this also fires at i == 0, so the first step already
        # uses lr = .005 — confirm that is intended.
        if not i % 1500:
            lr = lr * .5
        # Both gradients are evaluated at the current point before either
        # parameter is moved.
        deltaMu = gradNLL_mu(dsi.mu,dsi.sigma)
        deltaSigma = gradNLL_sigma(dsi.mu,dsi.sigma)
        dsi.mu = dsi.mu - lr * deltaMu
        dsi.mu[dsi.mu <= 0] = .01
        negLogLikelihood.append(nllfunc(dsi.mu, dsi.sigma))
        dsi.sigma = dsi.sigma - lr * deltaSigma
        # Record the iterate after the update so the histories line up with maes.
        mus.append(dsi.mu)
        sigmas.append(dsi.sigma)
        maes.append(leafMAE())
    # NOTE(review): the fifth element is the module-level logLikelihood
    # function, not the negLogLikelihood trace collected above. Callers in
    # this notebook unpack it into a variable named logLikelihood, so returning
    # the list here would rebind that function name; left unchanged.
    return dsi,mus,sigmas,maes,logLikelihood

def plotMAE(maes,dsi):
    """Plot the per-iteration MAE against the MAE of the global estimate.

    Parameters
    ----------
    maes : sequence of float
        Mean absolute errors, one per recorded iteration.
    dsi : dataset object
        Provides ``globalAlphaHats`` and ``trueAlphas``.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with the MAE curve and a horizontal reference line.
    """
    fig,ax = plt.subplots()
    ax.plot(maes,label="likelihood method")
    # Error made by using the mean global estimate for every bag.
    globalEstimate = dsi.globalAlphaHats.mean()
    globalMAE = np.mean(np.abs(globalEstimate - dsi.trueAlphas.flatten()))
    ax.hlines(globalMAE,
              0,len(maes),
              color="black",label="global")
    ax.legend()
    return fig

def plotDistrs(ds,mus_,sigmas_):
    """Plot, for every tree node, its estimate histogram with fitted normals.

    Each panel shows the histogram of ``ds.treeAlphaHats[idx]``, the normal
    pdf for the current ``ds.mu``/``ds.sigma`` ("final", green), and the normal
    pdf for ``mus_[0]``/``sigmas_[0]`` ("og", red). Panels in the last row also
    mark the true class prior of the corresponding bag.

    Parameters
    ----------
    ds : dataset object
        Provides ``N``, ``treeAlphaHats``, ``mu``, ``sigma`` and ``trueAlphas``.
    mus_ : sequence
        History of mean vectors; only the first entry is used.
    sigmas_ : sequence
        History of sigma vectors; only the first entry is used.

    Returns
    -------
    matplotlib.figure.Figure
        Grid with one row per tree level and ``ds.N`` columns.
    """
    Nrows = int(np.ceil(np.log2(ds.N))) + 1
    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid (ds.N == 1).
    fig,ax= plt.subplots(nrows=Nrows,ncols=ds.N,figsize=(5 * ds.N,5 * ds.N),
                         squeeze=False)
    # Evaluation grid shared by every pdf curve.
    grid = np.arange(0,1,.01)
    for row in range(Nrows):
        for col in range(2**row):
            idx = 2**row - 1 + col
            panel = ax[row,col]
            panel.hist(ds.treeAlphaHats[idx],density=True)
            # Final
            mu = ds.mu[idx]
            sigma = ds.sigma[idx]
            panel.plot(grid,ss.norm.pdf(grid,loc=mu,scale=sigma),
                       color="green",alpha=.5,label="final")
            panel.vlines(mu,0,1,color="green",label="alpha hat")
            # Original
            panel.plot(grid,ss.norm.pdf(grid,loc=mus_[0][idx],scale=sigmas_[0][idx]),
                       color="red",alpha=.5,label="og")

            if row == Nrows - 1:
                # Last row holds the leaves; convert the node index to a bag number.
                leafIndices = getChildren(idx, ds.N - 1).astype(int) - (ds.N-1)
                panel.vlines(ds.trueAlphas[leafIndices[0]],0,1,color="black",label="alpha")
            panel.legend()
    return fig

In [None]:
dsi = buildDataset(4,nP=10,nU=100,posMean=1,negMean=2,cov=1,
                   alphaDistr=lambda: np.random.choice([.10]))
# ds2 = buildDataset(1, nP=10,nU=100,posMean=1,negMean=2,cov=1,
#                    alphaDistr=lambda: np.random.choice([.90]))
# dsi.merge(ds2)

dsi = prepDS(dsi,numbootstraps=100)
# MAE of the initial leaf means, before any optimisation step.
print(np.mean(np.abs(dsi.mu[-dsi.numLeaves:] - dsi.trueAlphas.flatten())))
# The fifth return value is not used in this cell; it is bound to a throwaway
# name so the module-level logLikelihood function is not rebound.
dsi, mus,sigmas,maes,_ll = runAlgorithm(dsi,
                                        NIter=100,
                                        rlambda=1,
                                        lambdaTilde=1)
maefig1 = plotMAE(maes,dsi)
fig1 = plotDistrs(dsi,mus,sigmas)

In [None]:
# Two single-bag datasets with class priors 0.10 and 0.90, merged into one.
dsi = buildDataset(1,nP=10,nU=30,posMean=1,negMean=2,cov=1,
                   alphaDistr=lambda: np.random.choice([.10]))
ds2 = buildDataset(1, nP=10,nU=30,posMean=1,negMean=2,cov=1,
                   alphaDistr=lambda: np.random.choice([.90]))
dsi.merge(ds2)
# dsi = buildDataset(8,nP=5,nU=10,
#                    posMean=1,negMean=2,cov=1,
#                    alphaDistr=lambda: np.random.uniform(low=.1,high=.9))


dsi = prepDS(dsi)
# The fifth return value is not used in this cell; it is bound to a throwaway
# name so the module-level logLikelihood function is not rebound.
dsi, mus,sigmas,maes,_ll = runAlgorithm(dsi,
                                        NIter=5000,
                                        rlambda=1)
maefig2 = plotMAE(maes,dsi)
# plotDistrs needs the sigma history as well as the mean history.
fig2 = plotDistrs(dsi,mus,sigmas)

In [None]:

# dsi = buildDataset(16,nP=10,nU=30,
#                    posMean=1,negMean=2,cov=1,
#                    alphaDistr=lambda: np.random.uniform(low=.1,high=.9))


# dsi = prepDS(dsi)
# With the dataset construction above commented out, this cell reuses the
# `dsi` left by an earlier cell, so the optimiser starts from whatever
# dsi.mu / dsi.sigma the previous run ended on.
# The fifth return value is not used in this cell; it is bound to a throwaway
# name so the module-level logLikelihood function is not rebound.
dsi, mus,sigmas,maes,_ll = runAlgorithm(dsi,
                                        NIter=5000,
                                        rlambda=2)
maefig3 = plotMAE(maes,dsi)
# plotDistrs needs the sigma history as well as the mean history.
fig3 = plotDistrs(dsi,mus,sigmas)

In [None]:
# Create the output directory (and parents); safe to re-run if it already exists.
os.makedirs("figs/nb_19", exist_ok=True)

In [None]:
# Write fig3 to disk as a PDF under figs/nb_19.
fig3.savefig("figs/nb_19/fig3.pdf",format="pdf")

# Real datasets

In [None]:
from multiinstance.data.realData import buildDataset as buildReal

In [None]:
fileNames = glob("/data/dzeiberg/ClassPriorEstimation/rawDatasets/*.mat")
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figs/nb_19/real", exist_ok=True)
for fileName in tqdm(fileNames,total=len(fileNames)):
    # Dataset name = file name without its directory and extension.
    dsn = os.path.splitext(os.path.basename(fileName))[0]
    dsi = buildReal(fileName,8,
                       alphaDistr=lambda: np.random.uniform(.01,.95),
                      nPDistr=lambda: 1 + np.random.poisson(25),
                      nUDistr=lambda: 1 + np.random.poisson(75))
    dsi = prepDS(dsi)
    # The fifth return value is not used here; it is bound to a throwaway name
    # so the module-level logLikelihood function is not rebound.
    dsi, mus,sigmas,maes,_ll = runAlgorithm(dsi,
                                            NIter=5000,
                                            rlambda=.85)
    maefig = plotMAE(maes,dsi)
    # plotDistrs needs the sigma history as well as the mean history; build
    # the figure once and use it for both display and saving.
    dstrFig = plotDistrs(dsi,mus,sigmas)
    fig5 = dstrFig
    # Save before showing so the files are written from the live figures.
    maefig.savefig("figs/nb_19/real/{}_mae.pdf".format(dsn),format="pdf")
    fig5.savefig("figs/nb_19/real/{}_distrs.pdf".format(dsn),format="pdf")
    plt.show()
    # Release the figures so memory does not grow with the number of datasets.
    plt.close(maefig)
    plt.close(dstrFig)