In [1]:
import numpy as np

In [2]:
data = np.load("mcs_hw2_p3_data.npy")

In [3]:
x = data[:, :2]
y = data[:, 2]

In [62]:
import scipy.stats

def get_gradient_mu(beta, mu, sigma2):
    return (beta - mu) / sigma2

def get_gradient_logsigma2(beta, mu, sigma2):
    norm = np.linalg.norm(beta - mu)
    return (- 1 / sigma2 + norm * norm / (2 * sigma2 * sigma2)) * sigma2

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def get_log_p(x, y, beta):
    res = 0.0
    res += np.sum(y * np.log(sigmoid(np.dot(x, beta))) + (1.0 - y) * np.log(1 - sigmoid(np.dot(x, beta))))
    res += np.sum(scipy.stats.norm.logpdf(beta, np.zeros(2), np.ones(2)))
    return res

def get_log_q(mu, sigma2, beta):
    res = np.sum(scipy.stats.norm.logpdf(beta, mu, np.sqrt(sigma2)))
    return res

def elbo(x, y, mu, sigma2):
    sample_size = 1024
    sample_beta = np.random.normal(mu, np.sqrt(sigma2), size=[sample_size, mu.shape[0]])
    res = [get_log_p(x, y, beta) - get_log_q(mu, sigma2, beta) for beta in sample_beta]
    return np.mean(res)

In [63]:
def bbvi(x, y, mu, sigma2, lr, G):
    sample_size = 64
    sample_beta = np.random.normal(mu, np.sqrt(sigma2), size=[sample_size, mu.shape[0]])
    # update mu
    loss_mu = np.zeros(shape=[sample_size, mu.shape[0]])
    loss_logsigma2 = np.zeros(shape=[sample_size, sigma2.shape[0]])
    for i in range(sample_size):
        loss_mu[i] = get_gradient_mu(sample_beta[i], mu, sigma2)
        loss_logsigma2[i] = get_gradient_logsigma2(sample_beta[i], mu, sigma2)
        log_p = get_log_p(x, y, sample_beta[i])
        log_q = get_log_q(mu, sigma2, sample_beta[i])
        loss_mu[i] *= (log_p - log_q)
        loss_logsigma2[i] *= (log_p - log_q)
    update_mu = np.mean(loss_mu, axis=0)
    update_logsigma2 = np.mean(loss_logsigma2, axis=0)
    grad = np.concatenate([update_mu, update_logsigma2])
    G = G + np.outer(grad, grad)
    mu += lr * update_mu / np.sqrt(np.diag(G))[:2]
    sigma2 = np.exp(np.log(sigma2) + lr * update_logsigma2 / np.sqrt(np.diag(G))[2])
    #print(mu, sigma2)
    return mu, sigma2, G

In [70]:
mu_list = []
sigma2_list = []
def train_bbvi(x, y):
    mu = np.random.normal(size=2)
    sigma2 = np.power(np.random.normal(size=1), 2)
    G = np.zeros((3, 3))
    for i in range(100):
        lr = 0.999
        mu, sigma2, G = bbvi(x, y, mu, sigma2, lr, G)
        mu_list.append(mu)
        sigma2_list.append(sigma2)
        lr *= 0.999
        if i % 1 == 0:
            print(elbo(x, y, mu, sigma2))

In [71]:
train_bbvi(x, y)

-7914.464681971314
-11888.098262264499
-17874.90807013119
-11529.990719412446
-12548.522157623534
-10229.734433618294
-8848.571484386739
-6171.259417878
-5770.612504010439
-6003.299097947916
-6288.688322206181
-6132.6900154632285
-6292.14135804634
-6711.266873927639
-6018.223452223355
-6016.69961786796
-5737.745582414522
-5969.174692220148
-6076.602664398241
-5932.198831827467
-5423.698359145645
-5460.197938796147
-5394.659796172537
-5133.591059847637
-4840.829170692476
-4865.972341293588
-4924.149722128451
-4871.04161505294
-4966.236557296772
-5108.172535348707
-5014.316198847172
-4722.527614309291
-4902.1565171809825
-4747.137907336921
-4496.03570076779
-4470.940804740593
-4463.426990779433
-4545.479680793496
-4500.272903546032
-4476.857957619503
-4473.791739609776
-4537.388965929597
-4615.524034211722
-4621.146681580711
-4546.832131795823
-4616.781557490026
-4534.8083331210455
-4486.226086996594
-4523.077723647881
-4547.036064117856
-4791.483343084574
-4944.133286590464
-4721.799801

In [65]:
def bbvi_cv(x, y, mu, sigma2, lr, G):
    sample_size = 64
    sample_beta = np.random.normal(mu, np.sqrt(sigma2), size=[sample_size, mu.shape[0]])
    # update mu
    loss_mu = np.zeros(shape=[sample_size, mu.shape[0]])
    loss_logsigma2 = np.zeros(shape=[sample_size, sigma2.shape[0]])
    cv_mu = np.zeros(shape=[sample_size, mu.shape[0]])
    cv_sigma2 = np.zeros(shape=[sample_size, sigma2.shape[0]])
    for i in range(sample_size):
        loss_mu[i] = cv_mu[i] = get_gradient_mu(sample_beta[i], mu, sigma2)
        loss_logsigma2[i] = cv_sigma2[i] = get_gradient_logsigma2(sample_beta[i], mu, sigma2)
        log_p = get_log_p(x, y, sample_beta[i])
        log_q = get_log_q(mu, sigma2, sample_beta[i])
        loss_mu[i] *= (log_p - log_q)
        loss_logsigma2[i] *= (log_p - log_q)
        
    cov_mu0 = np.cov(np.stack((cv_mu.T[0], loss_mu.T[0]), axis=0))
    a_mu0 = cov_mu0[0][1] / cov_mu0[0][0]
    cov_mu1 = np.cov(np.stack((cv_mu.T[1], loss_mu.T[1]), axis=0))
    a_mu1 = cov_mu1[0][1] / cov_mu1[0][0]
    cov_logsigma2 = np.cov(np.stack((cv_sigma2.T[0], loss_logsigma2.T[0]), axis=0))
    a_logsigma2 = cov_logsigma2[0][1] / cov_logsigma2[0][0]
    
    update_mu = np.mean(loss_mu, axis=0)
    update_logsigma2 = np.mean(loss_logsigma2, axis=0)
    update_h_mu = np.mean(cv_mu, axis=0) * [a_mu0, a_mu1]
    update_h_logsigma2 = np.mean(cv_sigma2, axis=0) * a_logsigma2
    
    grad = np.concatenate([update_mu, update_logsigma2])
    G = G + np.outer(grad, grad)
    
    mu += lr * (update_mu - update_h_mu)/ np.sqrt(np.diag(G))[:2]
    sigma2 = np.exp(np.log(sigma2) + lr * (update_logsigma2 - update_h_logsigma2) / np.sqrt(np.diag(G))[2])
    #print(mu, sigma2)
    return mu, sigma2, G

In [68]:
def train_bbvi_cv(x, y):
    mu = np.random.normal(size=2)
    sigma2 = np.power(np.random.normal(size=1), 2)
    G = np.zeros((3, 3))
    for i in range(100):
        lr = 0.999
        mu, sigma2, G = bbvi_cv(x, y, mu, sigma2, lr, G)
        mu_list.append(mu)
        sigma2_list.append(sigma2)
        lr *= 0.999
        if i % 1 == 0:
            print(elbo(x, y, mu, sigma2))

In [69]:
train_bbvi_cv(x, y)

-4620.977433302092
-4610.896937119931
-4605.732559180638
-4602.323222369352
-4599.613044803253
-4594.996819330574
-4593.180568975669
-4591.499889575287
-4587.944841971397
-4585.201416833835
-4582.687303734738
-4579.850823561964
-4578.625435026414
-4577.057710228326
-4574.684179456468
-4573.385875983855
-4571.806772693046
-4570.5807903767945
-4569.19705388838
-4568.062251931045
-4566.856157533411
-4566.010349776893
-4564.4417086538215
-4564.033477976703
-4562.552104415649
-4561.358053165395
-4560.000549992327
-4559.079080571561
-4557.689834071993
-4556.86790525566
-4556.136157914234
-4554.817559483792
-4553.921536577547
-4552.94275254734
-4551.76295191855
-4550.954195520875
-4549.962661455607
-4549.199881172692
-4548.758614422217
-4547.744001569328
-4546.748557214141
-4546.371207385994
-4545.595623857969
-4544.648576013355
-4543.934046983752
-4543.315693023624
-4542.650365515125
-4542.335264294306
-4541.17862131822
-4541.001367801003
-4540.298822068027
-4539.75881692618
-4539.0132104658

In [118]:
def get_gradient_mu_rt(x, y, mu, sigma2, eps):
    beta = mu + eps * np.sqrt(sigma2)
    data_part = (y * (1 - sigmoid(np.dot(x, beta))))[:, None] * x 
    data_part += ((y - 1) * sigmoid(np.dot(x, beta)))[:, None] * x
    data_part = np.sum(data_part, axis=0)
    return data_part - beta

In [119]:
def get_gradient_logsigma2_rt(x, y, mu, sigma2, eps):
    res = 0.0
    beta = mu + eps * np.sqrt(sigma2)
    data_part = (y * (1 - sigmoid(np.dot(x, beta))))[:, None] * x 
    data_part += ((y - 1) * sigmoid(np.dot(x, beta)))[:, None] * x
    data_part = np.sum(data_part, axis=0)
    res += (data_part - beta) * eps
    res -= 1 / sigma2
    return res

In [120]:
def bbvi_rt(x, y, mu, sigma2, lr, G):
    sample_size = 64
    sample_eps = np.random.normal(size=[sample_size, mu.shape[0]])
    # update mu
    loss_mu = np.zeros(shape=[sample_size, mu.shape[0]])
    loss_logsigma2 = np.zeros(shape=[sample_size, sigma2.shape[0]])
    for i in range(sample_size):
        loss_mu[i] = get_gradient_mu_rt(x, y, mu, sigma2, sample_eps[i])
        loss_logsigma2[i] = get_gradient_logsigma2_rt(x, y, mu, sigma2, sample_eps[i])
    update_mu = np.mean(loss_mu, axis=0)
    update_logsigma2 = np.mean(loss_logsigma2, axis=0)
    grad = np.concatenate([update_mu, update_logsigma2])
    G = G + np.outer(grad, grad)
    mu += lr * update_mu / np.sqrt(np.diag(G))[:2]
    sigma2 = np.exp(np.log(sigma2) + lr * update_logsigma2 / np.sqrt(np.diag(G))[2])
    #print(mu, sigma2)
    return mu, sigma2, G

In [121]:
mu_list = []
sigma2_list = []
def train_bbvi_rt(x, y):
    mu = np.random.normal(size=2)
    sigma2 = np.power(np.random.normal(size=1), 2)
    G = np.zeros((3, 3))
    for i in range(100):
        lr = 0.999
        mu, sigma2, G = bbvi_rt(x, y, mu, sigma2, lr, G)
        mu_list.append(mu)
        sigma2_list.append(sigma2)
        lr *= 0.999
        if i % 1 == 0:
            print(elbo(x, y, mu, sigma2))

In [122]:
train_bbvi_rt(x, y)

ValueError: operands could not be broadcast together with shapes (10000,) (10000,2) 

In [92]:
y[: None].shape

(10000,)