In [None]:
# default_exp likelihoodMethods

In [None]:
# export
import autograd
from autograd import grad,jacobian,hessian
from autograd.scipy import stats as agss
import autograd.numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import scipy.stats as ss
import os
from scipy.optimize import minimize
from glob import glob

from multiinstance.likelihoodMethods import *

import scipy.stats as ss

from multiinstance.data.syntheticData import buildDataset
from multiinstance.utils import *
from multiinstance.agglomerative_clustering import AgglomerativeClustering

# Best-effort pinning of this process to CPU cores 10-19.
# os.sched_setaffinity only exists on some Unix platforms (AttributeError
# elsewhere) and raises OSError when none of the requested cores can be used,
# so failures are tolerated instead of aborting the import cell.
try:
    os.sched_setaffinity(0,set(range(10,20)))
except (AttributeError, OSError):
    pass

In [None]:
def prepDS(dsi, numbootstraps=100):
    """Attach per-node alpha estimates and initial Gaussian parameters to ``dsi``.

    The dataset is first passed through ``addTransformScores`` and
    ``addGlobalEsts``; ``getBagAlphaHats`` then supplies ``dsi.alphaHats``
    (one row per leaf) and ``dsi.curves``.  Nodes are numbered with the
    ``numInternal`` internal nodes first, followed by the leaves, so leaf
    ``k`` is node ``k + numInternal``.

    Attributes written on ``dsi``: ``alphaHats``, ``curves``, ``numLeaves``,
    ``numNodes``, ``numInternal``, ``mu`` (one entry per leaf), ``sigma``
    (one entry per node), ``leafN`` and ``treeAlphaHats`` (one entry per node).

    Parameters
    ----------
    dsi : dataset object accepted by the ``multiinstance`` helpers.
    numbootstraps : int, optional
        Forwarded to ``getBagAlphaHats``; defaults to the previously
        hard-coded value of 100.

    Returns
    -------
    The prepared ``dsi``.
    """
    dsi = addTransformScores(dsi)
    dsi = addGlobalEsts(dsi)
    dsi.alphaHats,dsi.curves = getBagAlphaHats(dsi,numbootstraps=numbootstraps)

    # Tree bookkeeping: numLeaves leaves and numLeaves - 1 internal nodes.
    dsi.numLeaves = dsi.alphaHats.shape[0]
    dsi.numInternal = dsi.numLeaves - 1
    dsi.numNodes = dsi.numLeaves + dsi.numInternal

    dsi.mu = np.zeros(dsi.numLeaves)
    dsi.sigma = np.ones(dsi.numNodes)
    # Every leaf contributes alphaHats.shape[1] estimates.
    dsi.leafN = np.ones_like(dsi.mu) * dsi.alphaHats.shape[1]
    dsi.treeAlphaHats = [[] for _ in range(dsi.numNodes)]

    # The positive sample pools every bag and does not depend on the node,
    # so build it once instead of once per internal node.
    # NOTE(review): assumes getTransformScores is side-effect free and that
    # getEsts does not modify its inputs in place -- confirm.
    if dsi.numInternal > 0:
        pos,_ = zip(*[getTransformScores(dsi,n) for n in range(dsi.N)])
        pos = np.concatenate(pos).reshape((-1,1))

    for nodeNum in range(dsi.numInternal):
        # Leaves under this internal node, shifted to 0-based leaf numbers.
        leafNums = getChildren(nodeNum, dsi.numInternal) - dsi.numInternal
        _,unlabeled = zip(*[getTransformScores(dsi,n) for n in leafNums])
        unlabeled = np.concatenate(unlabeled).reshape((-1,1))
        # Request as many estimates as the covered leaves hold in total.
        NEstimates = int(np.sum([dsi.leafN[l] for l in leafNums]))
        dsi.treeAlphaHats[nodeNum],_ = getEsts(pos, unlabeled, NEstimates)
        # Only the fitted scale is kept for internal nodes.
        _, dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])

    for leafNum in range(dsi.numLeaves):
        nodeNum = leafNum + dsi.numInternal
        dsi.treeAlphaHats[nodeNum] = dsi.alphaHats[leafNum]
        # Leaves are initialised with both the fitted mean and scale.
        dsi.mu[leafNum],dsi.sigma[nodeNum] = ss.norm.fit(dsi.treeAlphaHats[nodeNum])
    return dsi

In [None]:
def runAlgorithm(dsi, NIter=1000, lr=.001, lrDecay=.5, lrDecayEvery=1500):
    """Run plain gradient descent on the tree negative log-likelihood.

    ``dsi.mu`` and ``dsi.sigma`` are updated for ``NIter`` steps using the
    gradients of ``treeNegativeLogLikelihood(dsi.treeAlphaHats, dsi.leafN)``
    with respect to its first (mu) and second (sigma) argument.

    Parameters
    ----------
    dsi : dataset prepared by ``prepDS``; must also expose ``trueAlphas``.
    NIter : int, optional
        Number of gradient steps (default 1000).
    lr : float, optional
        Base learning rate (default .001).
    lrDecay : float, optional
        Factor applied to the learning rate at each decay step (default .5).
    lrDecayEvery : int, optional
        Decay period in iterations (default 1500).

    Returns
    -------
    (dsi, mus, sigmas, maes) where ``mus`` / ``sigmas`` hold the parameter
    values *before* each step and ``maes`` the mean absolute error of
    ``dsi.mu`` against ``dsi.trueAlphas`` *after* each step.
    """
    # Build the objective once; both gradients share it.
    nll = treeNegativeLogLikelihood(dsi.treeAlphaHats,dsi.leafN)
    gradNLL_mu = grad(nll,0)
    gradNLL_sigma = grad(nll,1)

    mus = []
    sigmas = []
    maes = []
    trueAlphas = dsi.trueAlphas.flatten()
    for i in tqdm(range(NIter),total=NIter):
        # NOTE(review): the decay also fires at i == 0, so the very first
        # step already uses lr * lrDecay.  Kept as-is so results match the
        # original implementation -- confirm whether this is intended.
        if i % lrDecayEvery == 0:
            lr = lr * lrDecay
        deltaMu = gradNLL_mu(dsi.mu,dsi.sigma)
        deltaSigma = gradNLL_sigma(dsi.mu,dsi.sigma)
        # Record the pre-update parameters; the updates below rebind
        # dsi.mu / dsi.sigma to new arrays, so the history is not aliased.
        mus.append(dsi.mu)
        sigmas.append(dsi.sigma)
        dsi.mu = dsi.mu - lr * deltaMu
        dsi.sigma = dsi.sigma - lr * deltaSigma
        maes.append(np.mean(np.abs(dsi.mu - trueAlphas)))
    return dsi,mus,sigmas,maes

In [None]:
def plotMAE(maes,dsi):
    """Plot the per-iteration MAE curve against the global-estimate baseline.

    ``maes`` is drawn as a line; a black horizontal line marks the mean
    absolute error obtained when the mean of ``dsi.globalAlphaHats`` is used
    as the estimate for every entry of ``dsi.trueAlphas``.
    """
    plt.plot(maes, label="likelihood method")

    # Baseline: one global alpha estimate compared with every true alpha.
    trueAlphas = dsi.trueAlphas.flatten()
    baselineMAE = np.mean(np.abs(dsi.globalAlphaHats.mean() - trueAlphas))
    plt.hlines(baselineMAE, 0, len(maes), color="black", label="global")

    plt.legend()

In [None]:
def plotDistrs(ds,mus_,sigmas_=None):
    """Plot, for every tree node, its alpha-estimate histogram and fitted normals.

    The figure has one row per tree level; row ``r`` uses its first ``2**r``
    columns and column ``c`` shows node ``2**r - 1 + c``.  Because the grid
    only has ``ds.N`` columns, this layout requires ``ds.N`` to be a power
    of two.

    On each axis:
      * histogram of ``ds.treeAlphaHats[node]``;
      * green curve: normal pdf using the current ``ds.mu`` / ``ds.sigma``;
      * red curve: normal pdf using the first recorded parameters
        (``mus_[0]`` / ``sigmas_[0]``);
      * on the last row, a black vertical line at the leaf's true alpha.

    A node's mean is the ``ds.numU``-weighted average of the means of the
    leaves returned by ``getChildren`` for that node.

    Parameters
    ----------
    ds : dataset returned by ``runAlgorithm``.
    mus_ : sequence of mu arrays; only ``mus_[0]`` is read.
    sigmas_ : sequence of sigma arrays, optional; only ``sigmas_[0]`` is
        read.  When omitted, the notebook-level global ``sigmas`` is used,
        which is what the original implementation always did.
    """
    if sigmas_ is None:
        # Backward-compatible fallback to the notebook global.
        sigmas_ = sigmas

    Nrows = int(np.ceil(np.log2(ds.N))) + 1
    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid (ds.N == 1).
    fig,ax = plt.subplots(nrows=Nrows,ncols=ds.N,figsize=(5 * ds.N,5 * ds.N),
                          squeeze=False)
    xs = np.arange(0,1,.01)
    for row in range(Nrows):
        for col in range(2**row):
            # Level-order node index: row r starts at node 2**r - 1.
            idx = col + 2**row - 1
            axis = ax[row,col]
            axis.hist(ds.treeAlphaHats[idx],density=True)
            leafIndices = getChildren(idx, ds.N - 1).astype(int) - (ds.N-1)
            ln = ds.numU[leafIndices]
            # Final
            mu = np.dot(ds.mu[leafIndices],ln)/np.sum(ln)
            pdf = ss.norm.pdf(xs,loc=mu,scale=ds.sigma[idx])
            axis.plot(xs,pdf,color="green",alpha=.5)
            # Original
            mu = np.dot(mus_[0][leafIndices],ln)/np.sum(ln)
            pdf = ss.norm.pdf(xs,loc=mu,scale=sigmas_[0][idx])
            axis.plot(xs,pdf,color="red",alpha=.5)
            axis.set_xlim(0,1)
            if row == Nrows - 1:
                axis.vlines(ds.trueAlphas[leafIndices[0]],0,1,color="black")

## Small Bags

In [None]:
# Small-bag experiment: two datasets built with nP=10 / nU=100 each.
# np.random.choice over a one-element list always returns that element, so
# the first dataset is generated with alpha .1 and the second with alpha .8.
dsi = buildDataset(1,nP=10,nU=100,posMean=1,negMean=2,cov=1,alphaDistr=lambda: np.random.choice([.1]))
ds2 = buildDataset(1, nP=10,nU=100,posMean=1,negMean=2,cov=1,alphaDistr=lambda: np.random.choice([.8]))
# Presumably folds ds2's bags into dsi in place (return value unused) -- TODO confirm.
dsi.merge(ds2)

# Compute per-node alpha estimates and initial mu / sigma.
dsi = prepDS(dsi)

# Gradient descent; `sigmas` must stay a notebook-level name because
# plotDistrs reads it as a global when no sigma history is passed.
dsi, mus,sigmas,maes = runAlgorithm(dsi)

plotMAE(maes,dsi)

plotDistrs(dsi,mus)

## Large Bags

In [None]:
# Large-bag experiment: same setup as the small-bag cell but with
# nP=1000 / nU=10000 per dataset (alphas fixed at .1 and .8).
dsi = buildDataset(1,nP=1000,nU=10000,posMean=1,negMean=2,cov=1,alphaDistr=lambda: np.random.choice([.1]))
ds2 = buildDataset(1, nP=1000,nU=10000,posMean=1,negMean=2,cov=1,alphaDistr=lambda: np.random.choice([.8]))
# Presumably folds ds2's bags into dsi in place (return value unused) -- TODO confirm.
dsi.merge(ds2)

# Compute per-node alpha estimates and initial mu / sigma.
dsi = prepDS(dsi)

# Rebinds the notebook-level dsi / mus / sigmas / maes from the previous experiment.
dsi, mus,sigmas,maes = runAlgorithm(dsi)

plotMAE(maes,dsi)

plotDistrs(dsi,mus)

## Moderate Number of Bags

In [None]:
# Experiment with buildDataset's first argument set to 4 (presumably the
# number of bags -- TODO confirm); all other generator settings are left at
# buildDataset's defaults.
dsi = buildDataset(4,nP=100,nU=1000)
# Compute per-node alpha estimates and initial mu / sigma.
dsi = prepDS(dsi)

# Rebinds the notebook-level dsi / mus / sigmas / maes from the previous experiment.
dsi, mus,sigmas,maes = runAlgorithm(dsi)

plotMAE(maes,dsi)

plotDistrs(dsi,mus)

In [None]:
# Display the leaf-level means left on dsi by the last runAlgorithm call.
dsi.mu

In [None]:
# Display the per-node sigmas left on dsi by the last runAlgorithm call.
dsi.sigma