In [None]:
# export
import autograd
from autograd import grad,jacobian,hessian
from autograd.scipy import stats as agss
import autograd.numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import scipy.stats as ss
import os
from scipy.optimize import minimize
from glob import glob

from multiinstance.likelihoodMethods import *

import scipy.stats as ss

from multiinstance.data.syntheticData import buildDataset
from multiinstance.utils import *
from multiinstance.agglomerative_clustering import AgglomerativeClustering

# Best-effort CPU pinning: restrict this process to cores 20-39 so the
# experiments below do not compete with other jobs on the shared machine.
# The call is wrapped so the notebook still runs on hosts where that is
# impossible instead of dying in the import cell.
try:
    os.sched_setaffinity(0,set(range(20,40)))
except (AttributeError, OSError) as _affinityErr:
    # AttributeError: the platform does not provide os.sched_setaffinity.
    # OSError: the requested cores do not exist or are not permitted here.
    import warnings
    warnings.warn("CPU affinity left unchanged: {}".format(_affinityErr))

In [None]:
def prepDS(dsi):
    """Attach the per-node class-prior estimates needed by the tree likelihood.

    The dataset is passed through `addTransformScores`, `addGlobalEsts` and
    `getBagAlphaHats` (100 bootstraps), then the following attributes are set:

    * ``numLeaves`` / ``numInternal`` / ``numNodes`` -- sizes of the tree over
      the bags (one leaf per row of ``alphaHats``, ``numLeaves - 1`` internal
      nodes).
    * ``treeAlphaHats`` -- list indexed by node number. Internal nodes come
      first (``0 .. numInternal-1``) and hold the estimates returned by
      `getEsts` on the unlabeled scores of the leaves below them; leaves
      follow and hold the matching row of ``alphaHats``.
    * ``mu`` -- per-leaf location from ``ss.norm.fit`` of the leaf estimates.
    * ``sigma`` -- per-node scale from ``ss.norm.fit`` of the node estimates.
    * ``leafN`` -- number of estimates per leaf (``alphaHats.shape[1]``).

    Parameters
    ----------
    dsi : dataset object
        Must expose ``N``; presumably ``N`` equals the number of bags, i.e.
        ``alphaHats.shape[0]`` -- TODO confirm against `buildDataset`.

    Returns
    -------
    The same dataset object, mutated in place.
    """
    dsi = addTransformScores(dsi)
    dsi = addGlobalEsts(dsi)
    dsi.alphaHats,dsi.curves = getBagAlphaHats(dsi,numbootstraps=100)

    dsi.numLeaves = dsi.alphaHats.shape[0]
    dsi.numNodes = dsi.numLeaves + (dsi.numLeaves - 1)
    dsi.numInternal = dsi.numNodes - dsi.numLeaves

    dsi.mu = np.zeros(dsi.alphaHats.shape[0])
    dsi.sigma = np.ones(dsi.numNodes)
    dsi.leafN = np.ones_like(dsi.mu) * dsi.alphaHats.shape[1]
    dsi.treeAlphaHats = [[] for _ in range(dsi.numNodes)]

    if dsi.numInternal > 0:
        # The positive sample is pooled over *all* bags and is therefore the
        # same for every internal node: build it once instead of once per node.
        pos,_ = list(zip(*[getTransformScores(dsi,n) for n in range(dsi.N)]))
        pos = np.concatenate(pos).reshape((-1,1))

    for nodeNum in range(dsi.numInternal):
        children = getChildren(nodeNum, dsi.numInternal)
        # Node numbers -> leaf (bag) numbers; cast to int so they are usable as
        # indices, the same way plotDistrs treats getChildren's result.
        leafNums = (children - dsi.numInternal).astype(int)
        _,unlabeled = list(zip(*[getTransformScores(dsi,n) for n in leafNums]))
        unlabeled = np.concatenate(unlabeled).reshape((-1,1))
        # An internal node gets as many estimates as all of its leaves combined.
        NEstimates = int(np.sum([dsi.leafN[l] for l in leafNums]))
        dsi.treeAlphaHats[nodeNum],_ = getEsts(pos, unlabeled, NEstimates)
        _, dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])

    for leafNum in range(dsi.numLeaves):
        nodeNum = leafNum + dsi.numInternal
        dsi.treeAlphaHats[nodeNum] = dsi.alphaHats[leafNum]
        dsi.mu[leafNum],dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])
    return dsi

In [None]:
def runAlgorithm(dsi,normalize=True,NIter=1000):
    """Fit leaf means and node scales by gradient descent on the tree NLL.

    Parameters
    ----------
    dsi : dataset object prepared by `prepDS`
        Reads ``mu``, ``sigma``, ``treeAlphaHats``, ``leafN`` and
        ``trueAlphas``; ``mu`` and ``sigma`` are replaced with the fitted
        values.
    normalize : bool
        Forwarded to `treeNegativeLogLikelihood` for the objective that is
        differentiated.
    NIter : int
        Number of gradient steps.

    Returns
    -------
    (dsi, mus, sigmas, maes, negLogLikelihood)
        ``mus`` / ``sigmas`` hold the parameter values *before* each step,
        ``maes`` the mean absolute error of ``mu`` against ``trueAlphas``
        (initial value followed by one entry per step) and
        ``negLogLikelihood`` the tracked objective after each ``mu`` update.
    """
    maes = [np.mean(np.abs(dsi.mu - dsi.trueAlphas.flatten()))]
    lr = .001

    # One objective, differentiated w.r.t. mu (arg 0) and sigma (arg 1).
    objective = treeNegativeLogLikelihood(dsi.treeAlphaHats,
                                          dsi.leafN,
                                          normalize=normalize)
    gradNLL_mu = grad(objective,0)
    gradNLL_sigma = grad(objective,1)
    mus = []
    negLogLikelihood = []
    # NOTE(review): the tracked NLL is built with treeNegativeLogLikelihood's
    # default `normalize`, not the `normalize` argument -- confirm this is intended.
    nllfunc = treeNegativeLogLikelihood(dsi.treeAlphaHats, dsi.leafN)
    sigmas = []
    for i in tqdm(range(NIter),total=NIter):
        # Halve the step size every 1500 iterations. i == 0 also matches, so
        # the first step already uses lr / 2.
        if not i % 1500:
            lr = lr * .5
        deltaMu = gradNLL_mu(dsi.mu,dsi.sigma)
        deltaSigma = gradNLL_sigma(dsi.mu,dsi.sigma)
        mus.append(dsi.mu)
        sigmas.append(dsi.sigma)
        dsi.mu = dsi.mu - lr * deltaMu
        # Keep the class-prior means strictly positive.
        dsi.mu[dsi.mu <= 0] = .01
        negLogLikelihood.append(nllfunc(dsi.mu, dsi.sigma))
        dsi.sigma = dsi.sigma - lr * deltaSigma
        maes.append(np.mean(np.abs(dsi.mu - dsi.trueAlphas.flatten())))
    # Return the trace collected above (the original returned an undefined name).
    return dsi,mus,sigmas,maes,negLogLikelihood

In [None]:
def plotMAE(maes,dsi):
    """Plot the MAE trace of the likelihood method against a global baseline.

    `maes` is drawn as a curve. The baseline is a horizontal black line at the
    mean absolute difference between the mean of ``dsi.globalAlphaHats`` and
    the flattened ``dsi.trueAlphas``, spanning x = 0 .. len(maes).
    Returns the created figure.
    """
    figure, axis = plt.subplots()
    axis.plot(maes, label="likelihood method")

    # Error obtained when every bag is assigned the single global estimate.
    truth = dsi.trueAlphas.flatten()
    globalEstimate = dsi.globalAlphaHats.mean()
    baseline = np.mean(np.abs(globalEstimate - truth))

    axis.hlines(baseline, 0, len(maes), color="black", label="global")
    axis.legend()
    return figure

In [None]:
def plotDistrs(ds,mus_,sigmas_=None):
    """Plot, for every tree node, its estimate histogram and fitted normals.

    Nodes are laid out by depth: row ``r`` holds nodes ``2**r - 1 + col``.
    Each used panel shows

    * a density histogram of ``ds.treeAlphaHats[node]``,
    * the "final" normal pdf (green): mean is the ``ds.numU``-weighted average
      of ``ds.mu`` over the leaves below the node, scale is ``ds.sigma[node]``,
    * the "og" normal pdf (red): same construction from ``mus_[0]`` and
      ``sigmas_[0]``,
    * on leaf panels, a black vertical line at the leaf's ``ds.trueAlphas``.

    Parameters
    ----------
    ds : dataset object after `prepDS` / `runAlgorithm`.
    mus_ : sequence of arrays
        Trace of leaf means; only ``mus_[0]`` is used.
    sigmas_ : sequence of arrays, optional
        Trace of node scales; only ``sigmas_[0]`` is used. When omitted the
        notebook-level variable ``sigmas`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    The created figure.
    """
    if sigmas_ is None:
        # Backward-compatible fallback to the notebook global.
        sigmas_ = sigmas
    numInternal = ds.N - 1
    numNodes = 2 * ds.N - 1
    Nrows = int(np.ceil(np.log2(ds.N))) + 1
    # squeeze=False keeps `ax` two-dimensional even for a single bag.
    fig,ax= plt.subplots(nrows=Nrows,ncols=ds.N,figsize=(5 * ds.N,5 * ds.N),
                         squeeze=False)
    xs = np.arange(0,1,.01)
    for row in range(Nrows):
        for col in range(2**row):
            idx = 2**row - 1 + col
            if idx >= numNodes:
                # The deepest level is only partly filled when N is not a
                # power of two; there is nothing to draw past the last node.
                break
            panel = ax[row,col]
            panel.hist(ds.treeAlphaHats[idx],density=True)
            leafIndices = getChildren(idx, numInternal).astype(int) - numInternal
            ln = ds.numU[leafIndices]
            # Final
            mu = np.dot(ds.mu[leafIndices],ln)/np.sum(ln)
            pdf = ss.norm.pdf(xs,loc=mu,scale=ds.sigma[idx])
            panel.plot(xs,pdf,color="green",alpha=.5,label="final")
            # Original
            mu = np.dot(mus_[0][leafIndices],ln)/np.sum(ln)
            pdf = ss.norm.pdf(xs,loc=mu,scale=sigmas_[0][idx])
            panel.plot(xs,pdf,color="red",alpha=.5,label="og")
            panel.legend()
            if idx >= numInternal:
                # Leaf node: mark the bag's true class prior.
                panel.vlines(ds.trueAlphas[leafIndices[0]],0,1,color="black")
    return fig

## Two bags with class priors far apart

In [None]:
# Two datasets built with identical settings except for alphaDistr:
# np.random.choice over a one-element list always returns that element, so the
# first uses alpha = .1 and the second alpha = .8.
# (first argument of buildDataset is presumably the number of bags -- TODO confirm)
dsi = buildDataset(1,nP=100,nU=1000,posMean=1,negMean=2,cov=1,
                   alphaDistr=lambda: np.random.choice([.1]))
ds2 = buildDataset(1, nP=100,nU=1000,posMean=1,negMean=2,cov=1,
                   alphaDistr=lambda: np.random.choice([.8]))
# Fold the second dataset into the first; dsi is the object used from here on.
dsi.merge(ds2)

dsi = prepDS(dsi)
# `mus` and `sigmas` stay at notebook scope on purpose: plotDistrs is handed
# `mus` explicitly and looks up `sigmas` by name.
dsi, mus,sigmas,maes,logLikelihood = runAlgorithm(dsi)
plotMAE(maes,dsi)
# `fig` is written to disk by the next cell.
fig = plotDistrs(dsi,mus)

In [None]:
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figs/nb_18",exist_ok=True)
fig.savefig("figs/nb_18/fig1.pdf")

## Moderate number of small bags

In [None]:
# Dataset whose alphaDistr draws uniformly from [0.05, 0.95).
# (4 is presumably the number of bags -- TODO confirm against buildDataset)
dsi = buildDataset(4,nP=10,nU=100,alphaDistr=lambda: np.random.uniform(0.05,.95))
dsi = prepDS(dsi)
# 2000 gradient steps; `nll` is the fifth value returned by runAlgorithm.
dsi, mus,sigmas,maes,nll = runAlgorithm(dsi,normalize=True,NIter=2000)
plotMAE(maes,dsi)
# The per-node distribution figure is bound to `fig3` ...
fig3 = plotDistrs(dsi,mus)
# ... while `fig` is rebound to a fresh figure holding the `nll` curve.
fig,ax = plt.subplots()
ax.plot(nll)

In [None]:
# The distribution figure of the preceding cell is bound to `fig3`; no `fig2`
# exists anywhere in the notebook, so saving `fig2` raised a NameError.
os.makedirs("figs/nb_18",exist_ok=True)
fig3.savefig("figs/nb_18/fig2.pdf",format="pdf")

## Two very small bags

In [None]:
# Dataset whose alphaDistr draws uniformly from [0.05, 0.95).
# (2 is presumably the number of bags -- TODO confirm against buildDataset)
dsi = buildDataset(2,nP=10,nU=30,alphaDistr=lambda: np.random.uniform(0.05,.95))
dsi = prepDS(dsi)
# 2000 gradient steps; `nll` is the fifth value returned by runAlgorithm.
dsi, mus,sigmas,maes,nll = runAlgorithm(dsi,normalize=True,NIter=2000)
plotMAE(maes,dsi)
# `fig3` (per-node distributions) is saved by the next cell.
fig3 = plotDistrs(dsi,mus)
# Separate figure for the `nll` curve.
fig,ax = plt.subplots()
ax.plot(nll)

In [None]:
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figs/nb_18/synthetic",exist_ok=True)
fig3.savefig("figs/nb_18/synthetic/fig3.pdf",format="pdf")

## 8 small bags

In [None]:
# Build and prepare a fresh dataset for this section. With these two lines
# commented out the cell silently re-optimized whatever `dsi` the previous
# section left behind (already-fitted mu/sigma of a different experiment).
dsi = buildDataset(8,nP=10,nU=30,alphaDistr=lambda: np.random.uniform(0.05,.95))
dsi = prepDS(dsi)
dsi, mus,sigmas,maes,likelihoods = runAlgorithm(dsi,normalize=True,NIter=2000)
plotMAE(maes,dsi)
# `fig4` is saved by the next cell.
fig4 = plotDistrs(dsi,mus)

In [None]:
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figs/nb_18/synthetic",exist_ok=True)
fig4.savefig("figs/nb_18/synthetic/fig4.pdf",format="pdf")

# Real Datasets

In [None]:
from multiinstance.data.realData import buildDataset as buildRealDS
from glob import glob

In [None]:
# Run the full pipeline on every raw .mat dataset and save three figures per
# dataset (MAE trace, per-node distributions, NLL trace).
# NOTE(review): absolute, machine-specific input path -- consider moving it to
# a configurable constant.
# sorted() makes the processing order reproducible (glob order is arbitrary).
fileNames = sorted(glob("/data/dzeiberg/ClassPriorEstimation/rawDatasets/*.mat"))
outDir = "figs/nb_18/bags_4"
# savefig does not create missing directories.
os.makedirs(outDir,exist_ok=True)
for fileName in tqdm(fileNames,total=len(fileNames)):
    # Dataset name = file name without directory and extension.
    dsn = os.path.splitext(os.path.basename(fileName))[0]
    dsi = buildRealDS(fileName,4,
                      nPDistr=lambda:np.random.choice([100]),
                      nUDistr=lambda:np.random.choice([1000]),
                      alphaDistr=lambda: np.random.uniform(0.05,.95))
    dsi = prepDS(dsi)
    dsi, mus,sigmas,maes,nll = runAlgorithm(dsi,normalize=True,NIter=100)

    # Each figure is saved *before* plt.show() (some backends release the
    # figure on show) and closed afterwards so figures do not pile up in
    # memory across datasets.
    maefig = plotMAE(maes,dsi)
    maefig.savefig(os.path.join(outDir,"{}_mae.pdf".format(dsn)),format="pdf")
    plt.show()
    plt.close(maefig)

    fig5 = plotDistrs(dsi,mus)
    fig5.savefig(os.path.join(outDir,"{}_distrs.pdf".format(dsn)),format="pdf")
    plt.show()
    plt.close(fig5)

    nllfig,ax = plt.subplots()
    ax.plot(nll)
    nllfig.savefig(os.path.join(outDir,"{}_NLL.pdf".format(dsn)),format="pdf")
    plt.show()
    plt.close(nllfig)
    