In [None]:
# default_exp data/syntheticData

In [None]:
# export
import numpy as np

In [None]:
import scipy.stats as ss
import matplotlib.pyplot as plt

In [None]:
# Poisson(25) pmf evaluated at k = 0..99 but drawn at x = k + 1: getBag samples
# the number of positives as Poisson(25) + 1, so the x-axis is the bag size.
plt.plot(np.arange(1,101,1),ss.poisson.pmf(np.arange(0,100,1),25))
plt.title("num positive pmf")

In [None]:
# Same construction for the unlabeled set: getBag samples its size as
# Poisson(30) + 1, hence the pmf at k is drawn at x = k + 1.
plt.plot(np.arange(1,101,1), ss.poisson.pmf(np.arange(0,100,1),30))
plt.title("num unlabeled pmf")

In [None]:
# Beta(2, 10) density on a 0.01 grid over [0, 1); this is the default
# alphaDistr used by getBag / buildDatasetDict / buildDataset.
plt.plot(np.arange(0,1,.01),ss.beta.pdf(np.arange(0,1,.01),2,10))
plt.title("alpha pdf")

In [None]:
# export

def getBag(nP=None, nU=None,posMean=None, negMean=None,cov=None,
           alphaDistr=lambda: np.random.beta(2,10)):
    """Sample one positive/unlabeled bag from a two-component Gaussian mixture.

    Parameters
    ----------
    nP : int, optional
        Number of labeled positive points. Drawn as Poisson(25) + 1 when None.
    nU : int, optional
        Target number of unlabeled points. Drawn as Poisson(30) + 1 when None.
    posMean, negMean : scalar or 1-D array-like
        Means of the positive and negative components. A scalar (0-d) posMean,
        including NumPy scalars, selects the univariate sampling path.
    cov : scalar or 2-D array-like
        Univariate path: forwarded to ``np.random.normal`` as ``scale``, i.e. it
        is used as a standard deviation. Otherwise: the covariance matrix given
        to ``np.random.multivariate_normal``.
    alphaDistr : callable
        Zero-argument callable returning the class prior ``alpha`` of the bag.

    Returns
    -------
    tuple
        ``(bag, numPositive, numUnlabeled)`` where ``bag`` is a dict holding the
        sampled points (2-D arrays, one row per point), the sampled ``alpha_i``,
        the hidden 0/1 labels of the unlabeled points and the generating
        parameters; the two counts are the row counts of the sampled arrays.

    Notes
    -----
    Both unlabeled components are forced to contain at least one point, so the
    returned unlabeled count can exceed ``nU`` (e.g. ``nU == 1``), and
    ``alpha_i`` is the sampled prior rather than the realized proportion.
    """
    if nP is None:
        nP = np.random.poisson(25) + 1
    if nU is None:
        nU = np.random.poisson(30) + 1
    alpha = alphaDistr()
    numUnlabeledPos = max(1,int(alpha * nU))
    numUnlabeledNeg = max(1,nU - numUnlabeledPos)

    # np.ndim also recognizes NumPy scalars and 0-d arrays, which an exact
    # type check against int/float would send down the multivariate path.
    if np.ndim(posMean) == 0:
        def samplePoints(mean, count):
            # Column vector so both paths return (count, dim) arrays.
            return np.random.normal(mean, cov, size=count).reshape((-1,1))
    else:
        def samplePoints(mean, count):
            return np.random.multivariate_normal(mean, cov, size=count)

    # Draw order (positives, unlabeled positives, unlabeled negatives) is kept
    # fixed so seeded runs stay reproducible.
    ptsPos = samplePoints(posMean, nP)
    ptsUnlabeled = np.concatenate([
            samplePoints(posMean, numUnlabeledPos),
            samplePoints(negMean, numUnlabeledNeg)
        ],axis=0)
    hiddenLabels = np.concatenate((np.ones(numUnlabeledPos),
                                   np.zeros(numUnlabeledNeg)))
    return {"positiveInstances":ptsPos,
            "unlabeledInstances": ptsUnlabeled,
            "alpha_i":alpha,
            "hiddenLabels": hiddenLabels,
            "posMean": posMean,
            "negMean": negMean,
            "cov": cov}, ptsPos.shape[0], ptsUnlabeled.shape[0]

In [None]:
# Poisson(0.5) pmf at k = 0..19 drawn at x = k + 1.
# NOTE(review): presumably meant to illustrate the random dimension drawn in
# buildDatasetDict, but that code uses Poisson(1) + 1 - confirm the intended rate.
plt.plot(np.arange(1,21),ss.poisson(.5).pmf(np.arange(20)))

In [None]:
# export
def buildDatasetDict(size,nP=None,nU=None,posMean=None, negMean=None,cov=None,
                     alphaDistr=lambda: np.random.beta(2,10),):
    """Sample ``size`` bags with getBag and pack them into zero-padded arrays.

    When ``posMean`` is None a dimension is drawn as Poisson(1) + 1 and
    ``posMean``, ``negMean`` and ``cov`` are all (re)generated for it: the
    means from N(0, 3) per coordinate and ``cov`` as the identity matrix.
    Any ``negMean`` / ``cov`` passed by the caller is replaced in that case.

    Returns a dict with:
      * "positiveInstances"  - (size, maxP, d) array, zero-padded per bag
      * "unlabeledInstances" - (size, maxU, d) array, zero-padded per bag
      * "hiddenLabels"       - (size, maxU) array, zero-padded per bag
      * "alpha_i"            - (size, 1) array of sampled class priors
      * "numP" / "numU"      - integer arrays with the true bag sizes
      * "posMean" / "negMean" / "cov" - the generating parameters
    """
    if posMean is None:
        nDims = np.random.poisson(1) + 1
        posMean = np.random.normal(loc=0,scale=3,size=nDims)
        negMean = np.random.normal(loc=0,scale=3,size=nDims)
        cov = np.eye(nDims)

    samples = [
        getBag(nP=nP, nU=nU, posMean=posMean, negMean=negMean, cov=cov,
               alphaDistr=alphaDistr)
        for _ in range(size)
    ]
    bags, numPs, numUs = list(zip(*samples))

    nBags = len(bags)
    maxP, maxU = np.max(numPs), np.max(numUs)
    d = bags[0]["positiveInstances"].shape[1]

    # Buffers start as zeros, so writing each bag into the leading rows
    # leaves the remainder as zero padding.
    posMats = np.zeros((nBags, maxP, d))
    unlabeledMats = np.zeros((nBags, maxU, d))
    hiddenLabelMats = np.zeros((nBags, maxU))
    alphas = np.zeros((nBags, 1))
    numPos = np.zeros(nBags,dtype=int)
    numU = np.zeros(nBags,dtype=int)

    for row, bag in enumerate(bags):
        positives = bag["positiveInstances"]
        unlabeled = bag["unlabeledInstances"]
        countP, countU = positives.shape[0], unlabeled.shape[0]
        posMats[row, :countP] = positives
        unlabeledMats[row, :countU] = unlabeled
        hiddenLabelMats[row, :countU] = bag["hiddenLabels"]
        alphas[row] = bag["alpha_i"]
        numPos[row] = countP
        numU[row] = countU

    return {
        "positiveInstances": posMats,
        "unlabeledInstances": unlabeledMats,
        "alpha_i": alphas,
        "numP": numPos,
        "numU": numU,
        "hiddenLabels": hiddenLabelMats,
        "posMean": posMean,
        "negMean": negMean,
        "cov": cov
    }

In [None]:
# Sanity check: concatenating a zero-width (4, 0) block along axis 1 is a
# no-op, so the result is still the (4, 3) array of ones.
np.concatenate((np.ones((4,3)),np.zeros((4,0))),axis=1)

In [None]:
# export
class Dataset:
    """Container for zero-padded positive/unlabeled bags and their parameters.

    Built from the dict produced by ``buildDatasetDict``. Bags are stored as
    padded arrays; ``numP`` / ``numU`` hold the true size of each bag so the
    padding can be stripped again with ``getBag``.
    """

    def __init__(self, d):
        """Copy the arrays and generating parameters out of the dict ``d``."""
        self.positiveInstances = d["positiveInstances"]
        self.unlabeledInstances = d["unlabeledInstances"]
        self.trueAlphas = d["alpha_i"]
        self.N = self.positiveInstances.shape[0]
        self.numP = d["numP"]
        self.numU = d["numU"]
        self.hiddenLabels = d["hiddenLabels"]
        self.posDistMean = d["posMean"]
        self.negDistMean = d["negMean"]
        self.cov = d["cov"]
        # Number of original datasets folded into this one; merge() uses it to
        # know whether the distribution parameters are already stacked.
        self._numSources = 1

    def getBag(self,idx):
        """Return (positives, unlabeled) of bag ``idx`` with padding removed."""
        p = self.positiveInstances[idx, :self.numP[idx]]
        u = self.unlabeledInstances[idx, :self.numU[idx]]
        return p,u

    @staticmethod
    def _stackPadded(first, second):
        """Zero-pad axis 1 of both arrays to a common length, then join on axis 0."""
        width = max(first.shape[1], second.shape[1])
        padded = []
        for arr in (first, second):
            padShape = (arr.shape[0], width - arr.shape[1]) + arr.shape[2:]
            padded.append(np.concatenate((arr, np.zeros(padShape)), axis=1))
        return np.concatenate(padded)

    @staticmethod
    def _asParamStack(value, numSources):
        """Give a single dataset's parameter a leading axis; leave stacks alone."""
        arr = np.asarray(value)
        return arr[np.newaxis] if numSources == 1 else arr

    def merge(self,ds2):
        """Append the bags of ``ds2`` to this dataset in place.

        Instance arrays and hidden labels are re-padded to the larger bag size
        of the two datasets before being concatenated along the bag axis.
        ``posDistMean``, ``negDistMean`` and ``cov`` become stacks with one
        entry per source dataset (not per bag); datasets that are themselves
        the result of earlier merges contribute their existing stacks, so
        ``merge`` can be applied repeatedly.
        """
        self.positiveInstances = self._stackPadded(self.positiveInstances,
                                                   ds2.positiveInstances)
        self.unlabeledInstances = self._stackPadded(self.unlabeledInstances,
                                                    ds2.unlabeledInstances)
        # Hidden labels share the unlabeled padding and must stay row-aligned
        # with unlabeledInstances.
        self.hiddenLabels = self._stackPadded(self.hiddenLabels,
                                              ds2.hiddenLabels)
        self.N += ds2.N
        self.numP = np.concatenate((self.numP,ds2.numP))
        self.numU = np.concatenate((self.numU,ds2.numU))
        self.trueAlphas = np.concatenate((self.trueAlphas, ds2.trueAlphas))

        nSelf = getattr(self, "_numSources", 1)
        nOther = getattr(ds2, "_numSources", 1)
        self.posDistMean = np.concatenate((self._asParamStack(self.posDistMean, nSelf),
                                           self._asParamStack(ds2.posDistMean, nOther)))
        self.negDistMean = np.concatenate((self._asParamStack(self.negDistMean, nSelf),
                                           self._asParamStack(ds2.negDistMean, nOther)))
        self.cov = np.concatenate((self._asParamStack(self.cov, nSelf),
                                   self._asParamStack(ds2.cov, nOther)))
        self._numSources = nSelf + nOther

In [None]:
# export
def buildDataset(size,nP=None,nU=None,posMean=None, negMean=None,cov=None,alphaDistr=lambda: np.random.beta(2,10)):
    """Sample ``size`` synthetic bags and return them wrapped in a Dataset.

    All arguments are forwarded unchanged to ``buildDatasetDict``; the
    resulting dict is handed to the ``Dataset`` constructor.
    """
    samplingArgs = dict(nP=nP, nU=nU, posMean=posMean, negMean=negMean,
                        cov=cov, alphaDistr=alphaDistr)
    return Dataset(buildDatasetDict(size, **samplingArgs))

In [None]:
# hide
# Two small datasets (2 and 3 bags) sharing the same 2-D means and identity
# covariance, with class priors drawn from Beta(2, 2); used to exercise merge.
d = buildDataset(2, alphaDistr=lambda: np.random.beta(2,2),posMean=[1,2],negMean=[2,2],cov=np.eye(2))
d2 = buildDataset(3, alphaDistr=lambda: np.random.beta(2,2),posMean=[1,2],negMean=[2,2],cov=np.eye(2))

In [None]:
# Per-bag class priors before merging: (2, 1) and (3, 1) arrays.
d.trueAlphas, d2.trueAlphas

In [None]:
# merge mutates d in place; rebuild d (cell above) before re-running this cell.
d.merge(d2)

In [None]:
# After the merge: d's 2 alphas followed by d2's 3, i.e. a (5, 1) array.
d.trueAlphas

In [None]:
# Expect (5, maxP, 2): 5 bags, padded to the larger positive-bag size of d and d2.
d.positiveInstances.shape