In [None]:
import autograd
from autograd import grad,jacobian,hessian
from autograd.scipy import stats as agss
import autograd.numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import scipy.stats as ss
import os
from glob import glob
from multiinstance.data.syntheticData import buildDataset
from multiinstance.utils import *

In [None]:
# Pin this process to CPU cores 0-19. `os.sched_setaffinity` is only provided
# on some Unix platforms, so skip the pinning where it does not exist instead
# of aborting the whole notebook with an AttributeError.
if hasattr(os, "sched_setaffinity"):
    os.sched_setaffinity(0, range(0,20))

In [None]:
# export
def logLikelihood(xi,a,b):
    """Summed log-density of the samples ``xi`` under a Beta(a, b) distribution.

    Args:
        xi: array of samples, evaluated with ``agss.beta.logpdf``.
        a: first Beta shape parameter (scalar); must be strictly positive.
        b: second Beta shape parameter (scalar); must be strictly positive.

    Returns:
        The sum over ``xi`` of the per-sample log-densities.

    Raises:
        AssertionError: if either shape parameter is not strictly positive.
    """
    # Check each parameter on its own: the product a*b is also positive when
    # both parameters are negative, which is not a valid Beta distribution.
    assert a > 0 and b > 0
    return np.sum(agss.beta.logpdf(xi,a,b))

def getChildren(idx,N):
    """Collect the leaf indices below node ``idx`` of a heap-ordered binary tree.

    Node ``k`` has the children ``2*k + 1`` and ``2*k + 2``; every index greater
    than ``N - 1`` is treated as a leaf.

    Args:
        idx: index of the subtree root.
        N: number of internal nodes (these occupy indices ``0 .. N-1``).

    Returns:
        1-D array with the leaf indices in left-to-right order; just ``[idx]``
        when ``idx`` is itself a leaf.
    """
    leaves = []
    pending = [idx]
    while pending:
        node = pending.pop()
        if node > N - 1:
            leaves.append(node)
            continue
        firstChild = 2 * node + 1
        # Push the right child first so the left subtree is expanded first.
        pending.append(firstChild + 1)
        pending.append(firstChild)
    return np.array(leaves)

def treeNegativeLogLikelihood(x,leafN,normalize=True,rlambda=.5):
    """Build the depth-weighted negative log-likelihood of a heap-ordered tree.

    Node ``idx`` is scored with ``logLikelihood(x[idx], a, b)`` where
    ``b = bagB[idx]`` and ``a = c / (1 - c) * b``, so that the Beta mean
    ``a / (a + b)`` equals ``c``. ``c`` is the ``leafN``-weighted average, over
    the leaves below the node, of ``leafA[leaf] / (leafA[leaf] + bagB[leafNode])``.
    A node at depth ``row`` (root = 0) contributes with weight ``rlambda**row``.

    Args:
        x: per-node samples, indexed by node number (``x[idx]``).
        leafN: array of per-leaf weights, indexed by leaf number.
        normalize: accepted for interface compatibility; not used by this
            function.
        rlambda: base of the per-depth weight.

    Returns:
        A function ``LL(leafA, bagB)`` returning the negated weighted sum of the
        node log-likelihoods. ``leafA`` is indexed by leaf number and ``bagB``
        by node number; the last ``len(bagB) - len(bagB) // 2`` nodes are the
        leaves.
    """
    def LL(leafA,bagB):
        NBags = len(bagB)
        # Integer count of internal nodes; the leaves start at this node index.
        NInternal_Nodes = NBags // 2
        ll = 0
        # Visit the nodes that exist rather than every slot of a full tree, so a
        # leaf count that is not a power of two does not index past x / bagB.
        for idx in range(NBags):
            # Depth of a heap index: 0 -> 0, 1..2 -> 1, 3..6 -> 2, ...
            row = (idx + 1).bit_length() - 1
            childrenIDXs = getChildren(idx, NInternal_Nodes)
            leafIndices = (childrenIDXs - NInternal_Nodes).astype(int)
            ln = leafN[leafIndices]
            leafMeans = leafA[leafIndices] / (leafA[leafIndices] + bagB[childrenIDXs])
            c = np.dot(leafMeans,ln)/np.sum(ln)
            b = bagB[idx]

            # Choose a so that the Beta(a, b) mean equals c.
            a = (c / (1 - c)) * b

            ll = ll + (rlambda**row) * logLikelihood(x[idx],a,b)
        return -1 * ll
    return LL

In [None]:
def prepDS(dsi,numbootstraps=1000):
    """Attach per-node alpha estimates and initial Beta parameters to ``dsi``.

    ``dsi`` is first passed through the project helpers ``addTransformScores``,
    ``addGlobalEsts`` and ``getBagAlphaHats`` (their behavior is not visible
    here). Afterwards these attributes are set on ``dsi``:

    * ``numLeaves`` / ``numNodes`` / ``numInternal`` -- sizes of the heap-ordered
      tree; ``numLeaves`` is ``alphaHats.shape[0]``.
    * ``leafA`` -- per leaf, the first shape parameter of a Beta fit to that
      leaf's ``alphaHats`` row, clipped below at 1.
    * ``bagB`` -- per node, initialised to 1; for internal nodes replaced by the
      second shape parameter of a Beta fit, clipped below at 1.
    * ``leafN`` -- per leaf, ``alphaHats.shape[1]``.
    * ``treeAlphaHats`` -- per node, the estimates shifted and scaled by the
      ``loc`` / ``scale`` of that node's Beta fit.

    Args:
        dsi: dataset object from the project (mutated in place).
        numbootstraps: forwarded to ``getBagAlphaHats``.

    Returns:
        The same ``dsi`` object.
    """
    dsi = addTransformScores(dsi)
    dsi = addGlobalEsts(dsi)
    dsi.alphaHats,dsi.curves = getBagAlphaHats(dsi,numbootstraps=numbootstraps)

    # Tree bookkeeping: the leaves occupy the last numLeaves node slots.
    dsi.numLeaves = dsi.alphaHats.shape[0]
    dsi.numNodes = 2 * dsi.numLeaves - 1
    dsi.numInternal = dsi.numNodes - dsi.numLeaves

    dsi.leafA = np.zeros(dsi.alphaHats.shape[0])
    dsi.bagB = np.ones(dsi.numNodes)
    dsi.leafN = dsi.alphaHats.shape[1] * np.ones_like(dsi.leafA)
    dsi.treeAlphaHats = [[] for _ in range(dsi.numNodes)]

    def _fitBeta(samples):
        # Fit a Beta with free loc/scale; return both shape parameters and the
        # samples expressed in units of the fitted loc/scale.
        shapeA, shapeB, loc, scale = ss.beta.fit(samples)
        return shapeA, shapeB, (samples - loc) / scale

    # Internal nodes: estimates are recomputed from the pooled scores.
    for nodeNum in range(dsi.numInternal):
        leafNums = getChildren(nodeNum, dsi.numInternal) - dsi.numInternal
        # Unlabeled scores come from the leaves under this node, positive scores
        # from every bag index in range(dsi.N).
        unlabeledParts = [u for _, u in [getTransformScores(dsi,n) for n in leafNums]]
        posParts = [p for p, _ in [getTransformScores(dsi,n) for n in range(dsi.N)]]
        pos = np.concatenate(posParts).reshape((-1,1))
        unlabeled = np.concatenate(unlabeledParts).reshape((-1,1))
        NEstimates = int(np.sum(dsi.leafN[leafNums]))
        dsi.treeAlphaHats[nodeNum],_ = getEsts(pos, unlabeled, NEstimates)

        _, shapeB, rescaled = _fitBeta(np.array(dsi.treeAlphaHats[nodeNum]))
        dsi.bagB[nodeNum] = max(1, shapeB)
        dsi.treeAlphaHats[nodeNum] = rescaled

    # Leaves: reuse the per-bag estimates computed above.
    for leafNum in range(dsi.numLeaves):
        nodeNum = dsi.numInternal + leafNum
        dsi.treeAlphaHats[nodeNum] = dsi.alphaHats[leafNum]

        shapeA, _, rescaled = _fitBeta(np.array(dsi.treeAlphaHats[nodeNum]))
        dsi.leafA[leafNum] = max(1, shapeA)
        dsi.treeAlphaHats[nodeNum] = rescaled
    return dsi

def runAlgorithm(dsi,normalize=True,NIter=1000,rlambda=.5):
    """Fit ``dsi.leafA`` and ``dsi.bagB`` by gradient descent on the tree NLL.

    Each iteration evaluates the gradient of
    ``treeNegativeLogLikelihood(dsi.treeAlphaHats, dsi.leafN, ...)`` with
    respect to both parameter arrays at the current point, takes one step of
    size ``lr`` and clips every parameter below at 1. ``lr`` starts at 1e-4 and
    is halved after every 1500 completed iterations.

    Args:
        dsi: dataset object carrying ``leafA``, ``bagB``, ``leafN``,
            ``treeAlphaHats``, ``numLeaves`` and ``trueAlphas``; ``leafA`` and
            ``bagB`` are replaced on the object.
        normalize: forwarded to ``treeNegativeLogLikelihood``.
        NIter: number of gradient steps.
        rlambda: forwarded to ``treeNegativeLogLikelihood``.

    Returns:
        ``(dsi, maes)`` where ``maes`` holds the mean absolute error between the
        leaf means ``leafA / (leafA + bagB[leaves])`` and ``dsi.trueAlphas``
        before the first step and after every step (``NIter + 1`` entries).
    """
    trueAlphas = dsi.trueAlphas.flatten()

    def leafMAE():
        # Error of the current leaf means against the true alphas.
        leafMeans = dsi.leafA / (dsi.leafA + dsi.bagB[-dsi.numLeaves:])
        return np.mean(np.abs(leafMeans - trueAlphas))

    maes = [leafMAE()]
    lr = .0001

    # One objective, differentiated with respect to each argument.
    nll = treeNegativeLogLikelihood(dsi.treeAlphaHats,
                                    dsi.leafN,
                                    normalize=normalize,
                                    rlambda=rlambda)
    gradNLL_leafA = grad(nll,0)
    gradNLL_bagB = grad(nll,1)
    for i in tqdm(range(NIter),total=NIter):
        # Decay only after steps have been taken: `not i % 1500` alone is also
        # true at i == 0 and would halve the rate before the very first step.
        if i and not i % 1500:
            lr = lr * .5
        # Both gradients are taken at the same point, before either update.
        deltaA = gradNLL_leafA(dsi.leafA,dsi.bagB)
        deltaB = gradNLL_bagB(dsi.leafA,dsi.bagB)
        dsi.leafA = np.maximum(np.ones_like(dsi.leafA), dsi.leafA - lr * deltaA)
        dsi.bagB = np.maximum(np.ones_like(dsi.bagB), dsi.bagB - lr * deltaB)
        maes.append(leafMAE())
    return dsi,maes

def plotMAE(maes,dsi):
    """Plot an MAE trace together with the global estimate's MAE.

    Args:
        maes: sequence of MAE values, drawn as a line labelled
            "likelihood method".
        dsi: dataset object providing ``globalAlphaHats`` and ``trueAlphas``.

    Returns:
        The matplotlib figure holding the plot.
    """
    fig,ax = plt.subplots()
    ax.plot(maes,label="likelihood method")

    # MAE obtained when the mean of the global estimates is compared with
    # every true alpha; drawn as a horizontal reference line.
    globalErrors = np.abs(dsi.globalAlphaHats.mean() - dsi.trueAlphas.flatten())
    baseline = np.mean(globalErrors)
    ax.hlines(baseline, 0, len(maes), color="black", label="global")

    ax.legend()
    return fig

def plotDistrs(ds,):
    """Plot, for every tree node, its estimates against the implied Beta density.

    The grid has one row per tree depth and ``ds.N`` columns; node ``idx`` is
    drawn at ``[row, col]`` with ``idx = col + 2**row - 1``. Each used panel
    shows a density histogram of ``ds.treeAlphaHats[idx]``, the Beta(a, b)
    density with ``b = ds.bagB[idx]`` and ``a = c / (1 - c) * b`` (``c`` being
    the ``leafN``-weighted mean of the leaf means below the node), a green line
    at ``a / (a + b)`` and a black line at the ``leafN``-weighted mean of the
    true alphas of those leaves.

    Args:
        ds: dataset object carrying ``N``, ``treeAlphaHats``, ``numInternal``,
            ``leafN``, ``leafA``, ``bagB`` and ``trueAlphas``.

    Returns:
        The matplotlib figure.
    """
    numNodes = len(ds.treeAlphaHats)
    Nrows = int(np.ceil(np.log2(ds.N))) + 1
    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid, so the
    # ax[row, col] indexing below always works.
    fig,ax= plt.subplots(nrows=Nrows,ncols=ds.N,figsize=(5 * ds.N,5 * ds.N),squeeze=False)
    grid = np.arange(0,1,.01)
    for row in range(Nrows):
        for col in range(2**row):
            idx = col + 2**row - 1
            if idx >= numNodes:
                # The last row is only partially filled when the number of
                # leaves is not a power of two.
                break
            ax[row,col].hist(ds.treeAlphaHats[idx],density=True)
            # Read everything from the `ds` argument; the body previously used
            # a global named `dsi` for these lookups.
            childrenIDXs = getChildren(idx, ds.numInternal)
            leafIndices = (childrenIDXs - ds.numInternal).astype(int)
            ln = ds.leafN[leafIndices]
            leafMeans = ds.leafA[leafIndices] / (ds.leafA[leafIndices] + ds.bagB[childrenIDXs])
            c = np.dot(leafMeans,ln)/np.sum(ln)
            b = ds.bagB[idx]

            # Choose a so that the Beta(a, b) mean equals c.
            a = (c / (1 - c)) * b
            print(a,b)
            pdf = ss.beta.pdf(grid,a=a,b=b)
            ax[row,col].plot(grid,pdf)
            ax[row,col].vlines(a / (a+b),0,1,color="green",alpha=.5,label="alpha hat")
            truth = np.dot(ds.trueAlphas[leafIndices].flatten(), ln)/np.sum(ln)
            ax[row,col].vlines(truth,0,1,color="black",label="alpha")
            ax[row,col].legend()
    return fig

In [None]:
# Build a small synthetic dataset, fit it and plot the results. The build/fit
# steps have to run here: no other visible cell defines `dsi` or `maes`, so with
# these lines commented out the plotting calls raise NameError on a fresh kernel.
dsi = buildDataset(2,nP=10,nU=30,posMean=1,negMean=2,cov=1,alphaDistr=lambda: np.random.uniform(0.2,0.8))

dsi = prepDS(dsi,)
dsi, maes = runAlgorithm(dsi,NIter=1000,rlambda=2)
plotMAE(maes,dsi)
fig = plotDistrs(dsi)

In [None]:
# Display the Beta parameters currently stored on the dataset:
# per-leaf `leafA` and per-node `bagB`.
dsi.leafA, dsi.bagB

In [None]:
# Plot the Beta(a=1, b=1.07) density on the grid 0, 0.01, ..., 0.99.
plt.plot(np.arange(0,1,.01),ss.beta.pdf(np.arange(0,1,.01), a=1., b=1.07))