In [None]:
import numpy as np
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from multiinstance.likelihood_method import getChildren, treeNegativeLogLikelihood,runAlgorithm, plotDistrs, plotMAE
from multiinstance.data.syntheticData import buildDataset
import scipy.stats as ss

from multiinstance.data.realData import buildDataset as buildReal

from multiinstance.utils import addTransformScores, addGlobalEsts, getBagAlphaHats,getEsts,getTransformScores

In [None]:
# export likelihood_method
def prepDS(dsi,numbootstraps=10, useAlphaMax=True):
    """Populate ``dsi`` with the per-node class-prior estimates used by the likelihood method.

    Steps, in order:

    1. ``addTransformScores`` and ``addGlobalEsts`` are applied to ``dsi``.
    2. ``getBagAlphaHats`` supplies ``dsi.alphaHats`` (one row per leaf, one
       column per estimate) and ``dsi.curves``.
    3. Tree bookkeeping is set up for ``numLeaves`` leaves and
       ``numLeaves - 1`` internal nodes. Internal nodes occupy node indices
       ``0 .. numInternal - 1``; leaf ``i`` is node ``numInternal + i``.
    4. For every internal node, the unlabeled scores of the leaves returned by
       ``getChildren`` are pooled and ``getEsts`` is run against the positive
       scores pooled over all ``dsi.N`` bags.
    5. ``scipy.stats.norm.fit`` is applied to each node's estimates; the fitted
       scale is stored in ``dsi.sigma`` (all nodes) and the fitted location in
       ``dsi.mu`` (leaves only).

    Parameters
    ----------
    dsi : dataset object
        Must already provide ``N`` and ``numU`` and whatever the helper
        functions above require.
    numbootstraps : int
        Forwarded to ``addGlobalEsts`` (as ``reps``) and ``getBagAlphaHats``.
    useAlphaMax : bool
        Forwarded to ``addGlobalEsts``, ``getBagAlphaHats`` and ``getEsts``.

    Returns
    -------
    The same dataset object with ``alphaHats``, ``curves``, ``numLeaves``,
    ``numNodes``, ``numInternal``, ``mu``, ``sigma``, ``leafN`` and
    ``treeAlphaHats`` set.
    """
    dsi = addTransformScores(dsi)
    dsi = addGlobalEsts(dsi,useAlphaMax=useAlphaMax,reps=numbootstraps)
    dsi.alphaHats,dsi.curves = getBagAlphaHats(dsi,
                                               numbootstraps=numbootstraps,
                                               useAlphaMax=useAlphaMax)

    dsi.numLeaves = dsi.alphaHats.shape[0]
    dsi.numNodes = dsi.numLeaves + (dsi.numLeaves - 1)
    dsi.numInternal = dsi.numNodes - dsi.numLeaves
    # Internal nodes get as many estimates as each leaf already has.
    NEstimates = dsi.alphaHats.shape[1]
    dsi.mu = np.zeros(dsi.alphaHats.shape[0])
    dsi.sigma = np.ones(dsi.numNodes)
    dsi.leafN = dsi.numU
    dsi.treeAlphaHats = [[] for _ in range(dsi.numNodes)]

    if dsi.numInternal > 0:
        # The pooled positive scores do not depend on the node, so build them
        # once instead of once per internal node.
        # NOTE(review): assumes getTransformScores is deterministic and that
        # getEsts does not modify its inputs in place - confirm.
        pos,_ = list(zip(*[getTransformScores(dsi,n) for n in range(dsi.N)]))
        pos = np.concatenate(pos).reshape((-1,1))

        for nodeNum in range(dsi.numInternal):
            children = getChildren(nodeNum, dsi.numInternal)
            # Convert node indices to leaf (bag) indices.
            leafNums = children - dsi.numInternal
            _,unlabeled = list(zip(*[getTransformScores(dsi,n) for n in leafNums]))
            unlabeled = np.concatenate(unlabeled).reshape((-1,1))

            dsi.treeAlphaHats[nodeNum],_ = getEsts(pos, unlabeled, NEstimates,useAlphaMax=useAlphaMax)
            # Only the fitted scale is kept for internal nodes.
            _, dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])

    for leafNum in range(dsi.numLeaves):
        nodeNum = leafNum + dsi.numInternal
        dsi.treeAlphaHats[nodeNum] = dsi.alphaHats[leafNum]
        dsi.mu[leafNum],dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])
    return dsi



In [None]:
from sklearn.metrics import roc_auc_score
def posteriorCorrection(tau, alpha, S0S1):
    """Return ``alpha * S0S1 * tau / (1 - tau)`` with infinite values replaced by 1.

    Parameters
    ----------
    tau : float or array-like
        Scores to correct. ``tau == 1`` makes the odds ``tau / (1 - tau)``
        infinite; those entries are set to 1 in the result.
    alpha : float or array-like
        Multiplier applied to the odds (``correctedAUC`` passes a bag's
        class-prior estimate).
    S0S1 : float or array-like
        Second multiplier (``correctedAUC`` passes the bag's ``numU / numP``).

    Returns
    -------
    numpy.ndarray
        Corrected values with the broadcast shape of the inputs (0-d for
        scalar input). Finite values are not clipped, and NaN (for example
        from ``alpha == 0`` combined with ``tau == 1``) is passed through.
    """
    tau = np.asarray(tau)
    # Division by zero at tau == 1 is expected and handled below, so do not
    # emit a RuntimeWarning for it.
    with np.errstate(divide="ignore"):
        odds = tau / (1 - tau)
    post = alpha * S0S1 * odds
    # np.where (rather than in-place item assignment) also works when the
    # inputs are scalars.
    return np.where(np.isinf(post), 1, post)

def correctedAUC(ds,bagAlphaHats,):
    """ROC AUC of the corrected unlabeled scores against ``ds.hiddenLabels``.

    For each of the ``ds.N`` bags, the second array returned by
    ``getTransformScores`` is passed through ``posteriorCorrection`` together
    with that bag's entry of ``bagAlphaHats`` and its ``numU / numP`` ratio.
    The corrected values of all bags are concatenated and scored with
    ``roc_auc_score`` against the first ``ds.numU[i]`` hidden labels of each bag.

    Parameters
    ----------
    ds : dataset object providing ``N``, ``numU``, ``numP`` and ``hiddenLabels``.
    bagAlphaHats : sequence with one class-prior estimate per bag.

    Returns
    -------
    The value returned by ``roc_auc_score``.
    """
    bagIndices = range(ds.N)
    # Only the second element (the unlabeled scores) of each result is needed.
    unlabeledScores = [getTransformScores(ds, bag)[1] for bag in bagIndices]
    sizeRatios = ds.numU/ds.numP

    corrected = []
    for tau, alphaHat, ratio in zip(unlabeledScores, bagAlphaHats, sizeRatios):
        corrected.append(posteriorCorrection(tau, alphaHat, ratio))
    yScore = np.concatenate(corrected)

    # Truncate each bag's labels to its number of unlabeled instances.
    yTrue = np.concatenate([ds.hiddenLabels[bag][:ds.numU[bag]] for bag in bagIndices])
    return roc_auc_score(yTrue, yScore)

In [None]:
def run(fileName,useAlphaMax=False):
    """Build a 4-bag dataset from ``fileName``, run the likelihood method, and plot and save the results.

    The dataset comes from ``buildReal`` with alpha ~ U(.05, .95),
    ``nPDistr`` = 1 + Poisson(25) and ``nUDistr`` = 1 + Poisson(75). It is
    prepared with ``prepDS`` (25 bootstraps) and optimised with
    ``runAlgorithm`` (2500 iterations, ``rlambda=1``). Three figures (MAE,
    distributions, NLL) are shown and written to ``figs/nb_24/`` as PDFs named
    ``<dataset>_<mode>_{mae,nll,distr}.pdf``, where mode is ``"alphamax"`` or
    ``"distcurve"``.

    NOTE: a later cell in this notebook defines another ``run`` with more
    parameters and a 7-value return; whichever cell was executed last wins.

    Parameters
    ----------
    fileName : str
        Path to a ``.mat`` dataset; the part after the last ``/`` with
        ``.mat`` removed is used in the figure names.
    useAlphaMax : bool
        Forwarded to ``prepDS``; also selects the mode tag in the file names.

    Returns
    -------
    tuple
        ``(localAE, globalAE, likelihoodAE, dsi.N)`` where ``localAE`` and
        ``likelihoodAE`` are the first and last entries of ``maes`` times
        ``dsi.N``, and ``globalAE`` is the summed absolute difference between
        the mean of ``dsi.globalAlphaHats`` and ``dsi.trueAlphas``.
    """
    import os  # function-scope import so this cell does not rely on the notebook's import cell

    # Create the output directory before the expensive work: savefig does not
    # create missing directories, and failing at the very end would discard
    # the whole run.
    os.makedirs("figs/nb_24", exist_ok=True)

    dsn = fileName.split("/")[-1].replace(".mat","")
    dsi = buildReal(fileName,4,
                       alphaDistr=lambda: np.random.uniform(.05,.95),
                      nPDistr=lambda: 1 + np.random.poisson(25),
                      nUDistr=lambda: 1 + np.random.poisson(75))
    dsi = prepDS(dsi,numbootstraps=25,useAlphaMax=useAlphaMax)
    dsi, mus,sigmas,maes,NLL = runAlgorithm(dsi,
                                            NIter=2500,
                                            rlambda=1,)
    # maes holds means; multiplying by dsi.N turns them back into sums so the
    # caller can pool them across datasets.
    localAE = maes[0] * dsi.N
    likelihoodAE = maes[-1] * dsi.N
    globalAE = np.abs(dsi.globalAlphaHats.mean() - dsi.trueAlphas.flatten()).sum()
    maeFig = plotMAE(maes,dsi)
    plt.show()
    dstrFig = plotDistrs(dsi,mus,sigmas)
    plt.show()
    nllFig,ax= plt.subplots()
    ax.plot(NLL)
    plt.show()
    if useAlphaMax:
        mode = "alphamax"
    else:
        mode = "distcurve"
    maeFig.savefig("figs/nb_24/{}_{}_mae.pdf".format(dsn,mode),format="pdf")
    nllFig.savefig("figs/nb_24/{}_{}_nll.pdf".format(dsn,mode),format="pdf")
    dstrFig.savefig("figs/nb_24/{}_{}_distr.pdf".format(dsn,mode),format="pdf")
    return localAE, globalAE, likelihoodAE,dsi.N

In [None]:
def run(fileName,numbags=16, numbootstraps=25,useAlphaMax=False, meanPosSize=10, meanUnlabeledSize=20,NIter=1000):
    """Build a dataset from ``fileName``, run the likelihood method, and report absolute errors and AUCs.

    The dataset comes from ``buildReal`` with alpha ~ U(.05, .95),
    ``nPDistr`` = 1 + Poisson(meanPosSize) and ``nUDistr`` =
    1 + Poisson(meanUnlabeledSize). It is prepared with ``prepDS`` and
    optimised with ``runAlgorithm`` (``rlambda=1``). Three figures (MAE,
    distributions, NLL) are shown and written to ``figs/nb_24/`` as PDFs named
    ``<dataset>_<mode>_{mae,nll,distr}.pdf``, where mode is ``"alphamax"`` or
    ``"distcurve"``.

    Parameters
    ----------
    fileName : str
        Path to a ``.mat`` dataset; the part after the last ``/`` with
        ``.mat`` removed is used in the figure names.
    numbags : int
        Second positional argument of ``buildReal``.
    numbootstraps : int
        Forwarded to ``prepDS``.
    useAlphaMax : bool
        Forwarded to ``prepDS``; also selects the mode tag in the file names.
    meanPosSize, meanUnlabeledSize : float
        Poisson means for ``nPDistr`` and ``nUDistr`` (each draw is 1 + Poisson).
    NIter : int
        Forwarded to ``runAlgorithm``.

    Returns
    -------
    tuple
        ``(localAE, globalAE, likelihoodAE, dsi.N, localAUC, likelihoodAUC,
        globalAUC)``. ``localAE`` and ``likelihoodAE`` are the first and last
        entries of ``maes`` times ``dsi.N``; ``globalAE`` is the summed
        absolute difference between the mean of ``dsi.globalAlphaHats`` and
        ``dsi.trueAlphas``. The AUCs come from ``correctedAUC`` using
        ``mus[0]``, ``mus[-1]`` and the mean global estimate repeated for
        every bag, respectively.
    """
    import os  # function-scope import so this cell does not rely on the notebook's import cell

    # Create the output directory before the expensive work: savefig does not
    # create missing directories, and failing at the very end would discard
    # the whole run.
    os.makedirs("figs/nb_24", exist_ok=True)

    dsn = fileName.split("/")[-1].replace(".mat","")
    dsi = buildReal(fileName,numbags,
                       alphaDistr=lambda: np.random.uniform(.05,.95),
                      nPDistr=lambda: 1 + np.random.poisson(meanPosSize),
                      nUDistr=lambda: 1 + np.random.poisson(meanUnlabeledSize))
    dsi = prepDS(dsi,numbootstraps=numbootstraps,useAlphaMax=useAlphaMax)
    dsi, mus,sigmas,maes,NLL = runAlgorithm(dsi,
                                            NIter=NIter,
                                            rlambda=1,)
    # maes holds means; multiplying by dsi.N turns them back into sums so the
    # caller can pool them across datasets.
    localAE = maes[0] * dsi.N
    likelihoodAE = maes[-1] * dsi.N
    globalAE = np.abs(dsi.globalAlphaHats.mean() - dsi.trueAlphas.flatten()).sum()
    localAUC = correctedAUC(dsi, mus[0])
    likelihoodAUC = correctedAUC(dsi,mus[-1])
    # The global estimate is a single number, so repeat it once per bag.
    globalAUC = correctedAUC(dsi,np.ones(dsi.N)*dsi.globalAlphaHats.mean())
    maeFig = plotMAE(maes,dsi)
    plt.show()
    dstrFig = plotDistrs(dsi,mus,sigmas)
    plt.show()
    nllFig,ax= plt.subplots()
    ax.plot(NLL)
    plt.show()
    if useAlphaMax:
        mode = "alphamax"
    else:
        mode = "distcurve"
    maeFig.savefig("figs/nb_24/{}_{}_mae.pdf".format(dsn,mode),format="pdf")
    nllFig.savefig("figs/nb_24/{}_{}_nll.pdf".format(dsn,mode),format="pdf")
    dstrFig.savefig("figs/nb_24/{}_{}_distr.pdf".format(dsn,mode),format="pdf")
    return localAE, globalAE, likelihoodAE,dsi.N, localAUC, likelihoodAUC, globalAUC

## Moderate Sized Bags
AlphaMax is clearly inferior to DistCurve in this setting.

In [None]:
# fileNames = glob("/data/dzeiberg/ClassPriorEstimation/rawDatasets/*.mat")
# Compare DistCurve and AlphaMax on every raw dataset. After each file, print
# the accumulated absolute errors of both estimators divided by the
# accumulated dsi.N.
fileNames = glob("/ssdata/ClassPriorEstimationPrivate/data/rawDatasets/*.mat")
absErrs = {"distcurve":{"local":0,
           "global":0,
           "likelihood":0},
           "alphamax":{"local":0,
           "global":0,
           "likelihood":0}}
N = 0
for fileName in tqdm(fileNames,total=len(fileNames)):
    print(fileName)
    # RUN DISTCURVE
    print("DistCurve")
    # Only the first four return values are used here. Star-unpacking absorbs
    # any trailing values, so this works with either definition of `run`
    # (4-value or 7-value return).
    localAE, globalAE, likelihoodAE, ni, *_extra = run(fileName, useAlphaMax=False)
    # Log Results
    N += ni
    absErrs["distcurve"]["local"] += localAE
    absErrs["distcurve"]["global"] += globalAE
    absErrs["distcurve"]["likelihood"] += likelihoodAE
    for k,v in absErrs["distcurve"].items():
        print(k, "{:.3f}".format(v/N))
    # AlphaMax
    print("Alphamax")
    # Previously unpacked into exactly four names, which raises ValueError
    # when `run` returns seven values.
    localAE, globalAE, likelihoodAE, *_extra = run(fileName, useAlphaMax=True)
    # Log Results
    # NOTE(review): the AlphaMax totals are normalised by the DistCurve N;
    # this presumes both runs produce the same dsi.N - confirm.
    absErrs["alphamax"]["local"] += localAE
    absErrs["alphamax"]["global"] += globalAE
    absErrs["alphamax"]["likelihood"] += likelihoodAE
    for k,v in absErrs["alphamax"].items():
        print(k, "{:.3f}".format(v/N))
    

In [None]:
# Scratch arithmetic: ratio of .014 to .094. The origin of these two numbers
# is not recorded in this notebook - TODO note which results they come from.
.014 / .094

In [None]:
# fileNames = glob("/data/dzeiberg/ClassPriorEstimation/rawDatasets/*.mat")
# DistCurve only: 16 bags, 100 bootstraps, 2500 iterations, Poisson means of
# 125 / 175 for the positive / unlabeled bag sizes. After each file, print the
# accumulated absolute errors divided by the accumulated dsi.N and the mean
# AUCs seen so far.
fileNames = glob("/ssdata/ClassPriorEstimationPrivate/data/rawDatasets/*.mat")
absErrs = {"distcurve": dict.fromkeys(("local", "global", "likelihood"), 0)}
aucs = {"distcurve": {name: [] for name in ("local", "global", "likelihood")}}
N = 0
for fileName in tqdm(fileNames,total=len(fileNames)):
    print(fileName)
    # RUN DISTCURVE
    print("DistCurve")
    runResult = run(fileName,
                    useAlphaMax=False,
                    numbags=16,
                    numbootstraps=100,
                    NIter=2500,
                    meanPosSize=125,
                    meanUnlabeledSize=175)
    localAE, globalAE, likelihoodAE,ni,localAUC,likelihoodAUC,globalAUC = runResult
    # Log Results
    N += ni
    for name, err in (("local", localAE), ("global", globalAE), ("likelihood", likelihoodAE)):
        absErrs["distcurve"][name] += err
    for name, auc in (("local", localAUC), ("likelihood", likelihoodAUC), ("global", globalAUC)):
        aucs["distcurve"][name].append(auc)
    print("MAE")
    for k,v in absErrs["distcurve"].items():
        print(k, "{:.3f}".format(v/N))
    print("AUC")
    for k,v in aucs["distcurve"].items():
        print(k, "{:.3f}".format(np.mean(v)))

In [None]:
# fileNames = glob("/data/dzeiberg/ClassPriorEstimation/rawDatasets/*.mat")
# Same experiment as the previous code cell (DistCurve, 16 bags, 100
# bootstraps, 2500 iterations, Poisson means 125 / 175), run again from
# scratch with fresh accumulators.
fileNames = glob("/ssdata/ClassPriorEstimationPrivate/data/rawDatasets/*.mat")
metricNames = ("local", "global", "likelihood")
absErrs = {"distcurve": {name: 0 for name in metricNames}}
aucs = {"distcurve": {name: [] for name in metricNames}}
N = 0
for fileName in tqdm(fileNames,total=len(fileNames)):
    print(fileName)
    # RUN DISTCURVE
    print("DistCurve")
    runKwargs = dict(useAlphaMax=False,
                     numbags=16,
                     numbootstraps=100,
                     NIter=2500,
                     meanPosSize=125,
                     meanUnlabeledSize=175)
    localAE, globalAE, likelihoodAE,ni,localAUC,likelihoodAUC,globalAUC = run(fileName, **runKwargs)
    # Log Results
    N += ni
    distErrs = absErrs["distcurve"]
    distErrs["local"] += localAE
    distErrs["global"] += globalAE
    distErrs["likelihood"] += likelihoodAE
    distAucs = aucs["distcurve"]
    distAucs["local"].append(localAUC)
    distAucs["likelihood"].append(likelihoodAUC)
    distAucs["global"].append(globalAUC)
    print("MAE")
    for k,v in distErrs.items():
        print(k, "{:.3f}".format(v/N))
    print("AUC")
    for k,v in distAucs.items():
        print(k, "{:.3f}".format(np.mean(v)))

4 bags 10 P, 20 U alpha~U(.05,.95)

MAE
local 0.211
global 0.239
likelihood 0.195
AUC
local 0.743
global 0.767
likelihood 0.747

16 Bags 125P 175U alpha~U(.05,.95)

MAE
local 0.056
global 0.227
likelihood 0.055
AUC
local 0.952
global 0.936
likelihood 0.951

In [None]:
# Scratch arithmetic: relative decrease going from .175 to .153. The origin of
# these two numbers is not recorded in this notebook - TODO note which results
# they come from.
(.175-.153) / .175