In [None]:
# default_exp gradientMethod

# Gradient Based Estimation

$\hat{\alpha_i}$: the local distCurve estimate for the $i^{th}$ bag

$\hat{\alpha_{c_i}}$: the $i^{th}$ global distCurve estimate using bootstrapped sample

$w_{ji}$: the contribution of bag j to the $i^{th}$ global estimate

$\tilde{\alpha_i}$: the expected global class prior given the current contribution values and local estimates for each bag

$\tilde{\alpha_i} = \frac{w_{1i} \cdot \hat{\alpha_1} \cdot n_1 + \dots + w_{Ni} \cdot \hat{\alpha_N} \cdot n_N}{w_{1i} \cdot n_1 + \dots + w_{Ni} \cdot n_N} $



Loss for cluster $c_i$


$\mathcal{L}_{c_i} = \frac{1}{2}(\tilde{\alpha_i} - \hat{\alpha_{c_i}})^2$

    def gradientMethod(ds):
        alphaHat : init alphaHat for each bag
        alpha_C : get K global alpha estimates
        init W randomly
        for each iteration:
            # calculate loss given the current values of alphaHat and w
            loss = lossFunction(w[:,1], alpha_C[1]) + ... + lossFunction(w[:,K], alpha_C[K])
            # update alphaHat
            alphaHat = alphaHat - eta * grad(loss)
            # calculate the loss given the current w and new alphaHats
            loss = lossFunction(1) + ... + lossFunction(K)
            w = w - eta * grad(loss)
            getMAE(alphaHat, alpha)

In [None]:
# export
from tqdm.notebook import tqdm

import autograd.numpy as np
from autograd import grad

import matplotlib.pyplot as plt


from multiinstance.dataset_utils import buildDataset
from multiinstance.utils import *
from multiinstance.distanceApproaches import *
from multiinstance.agglomerative_clustering import AgglomerativeClustering
from numba import set_num_threads

In [None]:
# Limit numba to 20 threads for this session (numba.set_num_threads).
set_num_threads(20)

In [None]:
# export
    
def getAlphaLoss(w,n, alphaHats):
    """
    Build the loss used when updating the class-prior vector with the
    contribution weights held fixed.

    Parameters
    ----------
    w : sequence of weight vectors, iterated in step with alphaHats
        (one weight vector per global estimate)
    n : vector multiplying each weighted entry (the callers pass the
        unlabeled-set sizes)
    alphaHats : sequence of global estimates, one target per weight vector

    Returns
    -------
    A function loss(alpha) giving the sum over (weights, target) pairs of
    0.5 * (target - weighted average of alpha)^2.
    """
    def loss(alpha):
        def squaredError(weights, target):
            # weighted mean of alpha, where each entry counts weights * n
            scale = 1 / np.dot(weights, n)
            expected = scale * np.dot(np.multiply(alpha, weights), n)
            return .5 * np.square(target - expected)
        # sum() starts from 0 and adds the terms in order
        return sum(squaredError(weights, target)
                   for weights, target in zip(w, alphaHats))
    return loss
    
def getWLoss(a,n, alphaHats):
    """
    Build the loss used when updating the contribution weights with the
    class-prior vector held fixed.

    Parameters
    ----------
    a : class-prior vector, treated as constant by the returned function
    n : vector multiplying each weighted entry (the callers pass the
        unlabeled-set sizes)
    alphaHats : sequence of global estimates, one target per row of w

    Returns
    -------
    A function loss(w) that walks the rows of w alongside alphaHats and sums
    0.5 * (target - weighted average of a under that row)^2.
    """
    def loss(w):
        def rowError(row, target):
            # weighted mean of a, where each entry counts row * n
            weightedMean = (1 / np.dot(row, n)) * np.dot(np.multiply(a, row), n)
            return .5 * np.square(target - weightedMean)

        total = 0
        for row, target in zip(w, alphaHats):
            total += rowError(row, target)
        return total
    return loss

In [None]:
# export
def getAlphaHat(dsi,reps=10):
    """
    Estimate the class prior on the data set as a whole.

    Every bag 0..dsi.N-1 is fetched with dsi.getBag, the first and second
    elements of each returned pair are pooled separately (presumably the
    positive and unlabeled instances -- verify against getBag), and the two
    pooled arrays are handed to getEsts together with reps.

    Returns the first element of the getEsts result.
    """
    pooledP, pooledU = [], []
    for bagNum in range(dsi.N):
        bagP, bagU = dsi.getBag(int(bagNum))
        pooledP.append(bagP)
        pooledU.append(bagU)
    estimates, _ = getEsts(np.concatenate(pooledP), np.concatenate(pooledU), reps)
    return estimates
    

In [None]:
# export
def gradientMethod(dsi, n_epochs=100, eta=.025):
    """
    Alternating gradient descent on the bag-level class priors and the
    contribution weights.

    Each epoch takes one gradient step on the alpha vector (weights fixed)
    followed by one gradient step on the weight matrix (alphas fixed), both
    against the squared-error loss built from dsi.globalAlphaHats.

    Parameters
    ----------
    dsi : data set object; reads globalAlphaHats, alphaHats, numU and
        trueAlphas
    n_epochs : number of alternating update rounds
    eta : step size used for both updates (defaults to the previously
        hard-coded .025)

    Returns
    -------
    list of n_epochs + 1 floats: the mean absolute error between the current
    alpha vector and dsi.trueAlphas before the first update and after every
    epoch.
    """
    alphaHats = dsi.globalAlphaHats
    # the ground truth does not change during optimisation, so flatten it once
    trueAlphas = dsi.trueAlphas.flatten()

    def currentMAE(estimate):
        return np.mean(np.abs(estimate - trueAlphas))

    # initialize values for gradient method
    a = dsi.alphaHats.mean(1)  # one starting value per bag: mean over axis 1
    n = dsi.numU
    # one row of weights per global estimate, one column per bag
    w = np.random.uniform(low=0.01,high=1,size=(len(alphaHats),
                                                n.shape[0]))
    maes = [currentMAE(a)]
    # Run gradient method
    for i in tqdm(range(n_epochs),total=n_epochs):
        # step on alpha with the weights held fixed
        alphaGrad = grad(getAlphaLoss(w,n,alphaHats))
        a = a - eta * alphaGrad(a)
        # step on the weights using the freshly updated alpha
        wGrad = grad(getWLoss(a,n,alphaHats))
        w = w - eta * wGrad(w)
        maes.append(currentMAE(a))
    return maes

In [None]:
def g2(dsi, n_epochs=100, etaAlpha=1., etaW=.025):
    """
    Variant of the alternating gradient method that optimises every entry of
    dsi.alphaHats instead of one value per bag.

    The 2-D alphaHats array is flattened for the loss, with dsi.numU tiled
    across axis 1 so each flattened entry is paired with its bag's size. The
    reported error uses the mean over axis 1 of the current estimate array.

    Parameters
    ----------
    dsi : data set object; reads globalAlphaHats, alphaHats, numU and
        trueAlphas
    n_epochs : number of alternating update rounds
    etaAlpha : step size of the alpha update (default 1., i.e. the raw
        gradient step the function has always taken)
    etaW : step size of the weight update (defaults to the previously
        hard-coded .025)

    Returns
    -------
    list of n_epochs + 1 floats: mean absolute error against dsi.trueAlphas
    before the first update and after every epoch.
    """
    alphaHats = dsi.globalAlphaHats
    # the ground truth does not change during optimisation, so flatten it once
    trueAlphas = dsi.trueAlphas.flatten()

    def currentMAE(estimates):
        return np.mean(np.abs(estimates.mean(1) - trueAlphas))

    # initialize values for gradient method
    a = dsi.alphaHats
    # repeat each bag's size once per column of a so it lines up with a.flatten()
    n = np.tile(dsi.numU.reshape((-1,1)), (1,a.shape[1])).flatten()
    w = np.random.uniform(low=0.01, high=1,size=(len(alphaHats),
                                                 n.shape[0]))
    maes = [currentMAE(a)]
    for i in tqdm(range(n_epochs), total=n_epochs):
        # step on the flattened estimates, then restore the 2-D layout
        alphaGrad = grad(getAlphaLoss(w,n,alphaHats))
        a = a - etaAlpha * alphaGrad(a.flatten()).reshape(a.shape)
        # step on the weights using the freshly updated estimates
        wGrad = grad(getWLoss(a.flatten(),n,alphaHats))
        w = w - etaW * wGrad(w)
        maes.append(currentMAE(a))
    return maes

In [None]:
import scipy.stats as ss

In [None]:
# Beta(500, 500) density on a 0.01-spaced grid over [0, 1); the x-axis of the
# plot is the grid index because only the y values are passed to plot
betaGrid = np.arange(0, 1, .01)
plt.plot(ss.beta(500, 500).pdf(betaGrid))

In [None]:
def bimodal():
    """
    Draw one value from an equal-weight mixture of Beta(2, 10) and Beta(10, 3).

    A fair coin flip selects the component first; only the selected component
    is then sampled.
    """
    pickFirst = np.random.binomial(1, .5)
    return np.random.beta(2, 10) if pickFirst else np.random.beta(10, 3)

In [None]:
def initDS(ds_size=100,n_alpha_ests=50, nP=None, nU=None,
           alphaDistr=lambda: np.random.uniform(0.1,.5),posMean=None, negMean=None,cov=None):
    """
    Build a data set and attach the estimates the gradient method reads.

    buildDataset receives ds_size plus the distribution and size arguments
    unchanged. The result then gains three attributes:
      * alphaHats, curves -- from getBagAlphaHats with n_alpha_ests bootstraps
      * globalAlphaHats   -- from getAlphaHat with n_alpha_ests repetitions

    Returns the augmented data set object.
    """
    buildArgs = dict(alphaDistr=alphaDistr, nP=nP, nU=nU,
                     posMean=posMean, negMean=negMean, cov=cov)
    dsi = buildDataset(ds_size, **buildArgs)
    # (the addTransformScores step is intentionally left disabled)
    bagLevel = getBagAlphaHats(dsi, numbootstraps=n_alpha_ests)
    dsi.alphaHats, dsi.curves = bagLevel
    dsi.globalAlphaHats = getAlphaHat(dsi, reps=n_alpha_ests)
    return dsi

In [None]:
def yangDistributionDifference(posMean, negMean, cov, p=1):
    """
    Sample-based distance between two multivariate normal distributions that
    share the covariance cov and have means posMean and negMean.

    Eq. (7) from :

    Yang, R., Jiang, Y., Mathews, S. et al.
    Data Min Knowl Disc (2019) 33: 995.
    https://doi.org/10.1007/s10618-019-00622-6

    1000 points are drawn from each distribution (positive first, then
    negative), both densities are evaluated on both samples, and the
    resulting arrays are combined by _yangHelper and _yH2.

    p is the exponent used when combining the two one-sided terms.
    """
    sampleSize = 1000
    posSample = np.random.multivariate_normal(mean=posMean, cov=cov, size=sampleSize)
    negSample = np.random.multivariate_normal(mean=negMean, cov=cov, size=sampleSize)

    def density(points, mean):
        return ss.multivariate_normal.pdf(points, mean=mean, cov=cov)

    # naming: <density>PDF_<sample it is evaluated on>
    negPDF_neg = density(negSample, negMean)
    posPDF_neg = density(negSample, posMean)
    negPDF_pos = density(posSample, negMean)
    posPDF_pos = density(posSample, posMean)

    zeros = np.zeros(sampleSize)
    # only the negative excess is needed on the negative sample, and only the
    # positive excess on the positive sample
    _, pdfDiffNeg_NEG, pdfMax_NEG = _yangHelper(negPDF_neg, posPDF_neg, zeros)
    pdfDiffPos_POS, _, pdfMax_POS = _yangHelper(negPDF_pos, posPDF_pos, zeros)
    return _yH2(pdfDiffNeg_NEG, negPDF_neg, pdfDiffPos_POS, posPDF_pos,
                posPDF_neg, negPDF_pos, pdfMax_NEG, pdfMax_POS, p, sampleSize)

def _yangHelper(negPDF,posPDF,z):
    """
    Element-wise pieces of the density comparison.

    Returns, in this order: max(posPDF - negPDF, z), max(negPDF - posPDF, z)
    and max(negPDF, posPDF). The caller passes an array of zeros for z.
    """
    gap = negPDF - posPDF
    negExcess = np.maximum(gap, z)
    posExcess = np.maximum(-1 * gap, z)
    larger = np.maximum(negPDF, posPDF)
    return posExcess, negExcess, larger

def _yH2(pdfDiffNeg_NEG, negPDF_NEG, pdfDiffPos_POS, posPDF_POS, posPDF_NEG, negPDF_POS, pdfMax_NEG, pdfMax_POS,p,sampleSize):
    """
    Combine the per-sample arrays into the final normalised distance.

    The numerator is the p-norm of the two mean excess ratios. The
    denominator is the summed ratio of the larger density to the equal-weight
    mixture density over both samples, divided by 2 * sampleSize.
    """
    negRatio = np.mean(pdfDiffNeg_NEG / negPDF_NEG)
    posRatio = np.mean(pdfDiffPos_POS / posPDF_POS)
    dPHat = np.power(np.power(negRatio, p) + np.power(posRatio, p), 1 / p)
    # equal-weight mixture density evaluated on each sample
    mixtureOnNeg = (0.5 * posPDF_NEG) + (0.5 * negPDF_NEG)
    mixtureOnPos = (0.5 * posPDF_POS) + (0.5 * negPDF_POS)
    normalizer = (np.sum(pdfMax_NEG / mixtureOnNeg) + np.sum(pdfMax_POS / mixtureOnPos)) / (2 * sampleSize)
    return dPHat / normalizer

In [None]:
# Quick check on two 2-D Gaussians with identity covariance whose means differ
# by 2 along the first axis
yangDistributionDifference(posMean=np.array([1,2]),
                           negMean=np.array([3,2]),
                           cov=np.eye(2),
                           p=1)

In [None]:
# Compare the gradient method with agglomerative clustering on freshly
# generated data sets, saving one diagnostic figure per repetition.
import os

N_REPS = 10
N_EPOCHS = 100
FIG_DIR = "figs/nb_09/distrDistFigs"
# savefig does not create missing directories, so make sure the target exists
os.makedirs(FIG_DIR, exist_ok=True)

for rep in tqdm(range(N_REPS),total=N_REPS,desc="reps"):
    # build dataset
    dsi = initDS(ds_size=25, n_alpha_ests=10)
    # Run gradient method
    maes = g2(dsi,n_epochs=N_EPOCHS)
    # Run agglomerative clustering
    agg0 = AgglomerativeClustering(dsi, .5,use_alphas_as_scores=True)
    agg0.cluster()
    # plot results
    fig,ax = plt.subplots(1,5,figsize=(20,4))
    # Plot MAEs
    ax[0].plot(maes,label="gradient")
    maes2 =agg0.meanAbsErrs
    ax[0].plot(maes2, label="agg")
    # baseline: every bag assigned the mean of the global estimates
    globalMAE = np.mean(np.abs(dsi.trueAlphas - dsi.globalAlphaHats.mean()))
    # span the baseline over the epochs actually run and name it in the legend
    ax[0].hlines(globalMAE, 0,N_EPOCHS, label="global")
    ax[0].legend()
    ax[1].hist(dsi.trueAlphas)
    ax[1].set_title(r"$\alpha$")
    ax[2].hist(dsi.numP)
    ax[2].set_title("Num Positive")
    ax[3].hist(dsi.numU)
    ax[3].set_title("Num Unlabeled")
    # per bag, sum the first numU entries of its hidden labels
    ax[4].hist([h[:n].sum() for h,n in zip(dsi.hiddenLabels, dsi.numU)])
    ax[4].set_title("Num Unlabeled Positive")
    fig.suptitle("Distr Distance: {:.4f}    dim:{}".format(yangDistributionDifference(dsi.posDistMean,dsi.negDistMean,dsi.cov),
                                                          dsi.posDistMean.shape))
    plt.savefig("{}/fig_{}.pdf".format(FIG_DIR, rep),format="pdf")
    plt.show()
    # release the figure so repeated iterations do not accumulate open figures
    plt.close(fig)

# Diagnosis

After all, I wasn't using the same distribution for each data set but was sampling the dimension and the means of the mvn distributions randomly. The method performs well on the data sets in which the distributions are further away, but will lead to MAE values worse than that of the local estimates when the distributions have smaller distance.