We explore importance sampling on some trivial test cases.

The steps of importance sampling are described in Appendix D of the Overleaf document.

We will implement them first for the 1-dimensional z-test, then the 2- and 3-dimensional z-tests.

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas 
import seaborn 
import sklearn 
#import ipython
from sklearn.cluster import KMeans

In [30]:
# Pilot simulation: a single vector of standard-normal draws is shifted by
# every candidate mean, so column j of `data` holds draws centred at mu[j].
mu = np.linspace(-2, 2, num=11)
z = np.random.normal(loc=0, scale=1, size=1000)
data = z[:, None] + mu[None, :]  # shape (1000, 11): rows = draws, columns = means

In [194]:
# Pool every pilot draw into one vector and keep only the rejections
# (draws above 1.96); these are the points k-means will be run on.
flat_data = data.flatten()
selection = flat_data > 1.96
rejections = flat_data[selection]
# Centre and scale the rejections before clustering.
standardized_rejections = (rejections - rejections.mean()) / rejections.std()

In [206]:
# K-means configuration: 5 clusters, random initial centroids, 10
# initialisations, at most 300 iterations, and a fixed random_state.
n_clusters = 5
kmeans = KMeans(
    n_clusters=n_clusters,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [207]:
# Fit on the standardized rejections, presented as a single-feature column.
kmeans.fit(standardized_rejections[:, None])

In [213]:
# Map the fitted centroids from standardized units back to the original scale
# by undoing the (x - mean) / std transform applied to the rejections.
mu_cluster_centers = rejections.mean() + rejections.std() * kmeans.cluster_centers_
mu_cluster_centers

array([[3.97053085],
       [2.46820867],
       [3.31656385],
       [2.11366402],
       [2.87094621]])

In [209]:
# Cluster index assigned to each rejection, in the same order as `rejections`.
kmeans.labels_

array([3, 3, 1, ..., 1, 4, 3], dtype=int32)

In [210]:
# Label every pilot draw: -1 marks a non-rejection, any other value is the
# index of the k-means cluster the rejection was assigned to.
n_orig = flat_data.shape[0]
flat_labels = np.full(shape=n_orig, fill_value=-1, dtype=np.int32)
flat_labels[selection] = kmeans.labels_
# Show how many draws carry each label.
np.unique(flat_labels, return_counts=True)

(array([-1,  0,  1,  2,  3,  4], dtype=int32),
 array([9602,   83,  383,  188,  459,  285]))

In [214]:
# Tabulate, for each mean mu[i] (one column each), how many pilot draws were
# non-rejections (row 0, label -1) or fell into each k-means cluster
# (rows 1..n_clusters, labels 0..n_clusters-1).
labels = flat_labels.reshape(data.shape)
n_sims_per_theta = data.shape[0] # 1000 for now
n_theta = data.shape[1] # 11 for now
target_fraction = np.full((n_clusters + 1, n_theta), -1)
labelset = np.unique(labels)  # labels actually observed; kept for inspection
# Build the bin edges from the full label range -1..n_clusters-1 instead of
# from the observed labels: if any label is absent (e.g. there are no
# non-rejections, or a cluster is empty) the observed-label edges would give
# the wrong number of bins and the column assignment below would fail.
labelbins = np.arange(-1, n_clusters + 1) - 0.5
for i in range(n_theta):
    # np.histogram(...)[0] holds the counts per label for column i.
    target_fraction[:, i] = np.histogram(labels[:, i], bins=labelbins)[0]

In [215]:
# Pilot counts per (label, mean): row 0 holds the non-rejections, rows 1-5 the
# five k-means clusters, and each column is one value of mu. Despite the name,
# the entries are histogram counts rather than fractions.
target_fraction
# looks good so far!

array([[1000, 1000, 1000,  999,  994,  971,  938,  865,  743,  630,  462],
       [   0,    0,    0,    0,    0,    0,    0,    1,    4,   27,   51],
       [   0,    0,    0,    0,    2,    6,   24,   46,   70,  113,  122],
       [   0,    0,    0,    0,    0,    0,    2,    7,   28,   51,  100],
       [   0,    0,    0,    1,    4,   21,   30,   57,  106,   99,  141],
       [   0,    0,    0,    0,    0,    2,    6,   24,   49,   80,  124]])

Next, we draw the importance samples and compute the weights used to re-weight them.

array([-2.        , -1.6       , -1.2       , -0.8       , -0.4       ,
        0.        ,  0.4       ,  0.8       ,  1.2       ,  1.6       ,
        2.        ,  3.97053085,  2.46820867,  3.31656385,  2.11366402,
        2.87094621])

In [272]:
# Weights-matrix setup: how many sims are we planning for each value of theta?
n_per_thetaj = 1000
# We want this to net out to, let's say, 1% of the total weight...
# Drop row 0 (the non-rejections) so only per-cluster rejection counts remain.
sum_rejects = target_fraction[1:].copy()
# Keep only the means whose pilot run produced at least one rejection.
any_successes = sum_rejects.sum(axis=0) > 0
relevant_mu = mu[any_successes]

In [302]:
# Turn the per-cluster rejection counts into mixture weights: each column is
# normalised by its total number of rejections.
col_totals = sum_rejects.sum(axis=0)
# Columns without any rejection have a zero total; dividing through them
# raised a RuntimeWarning and filled `temp` with NaN. Divide only where the
# total is positive and leave the other columns (discarded below) at 0.
temp = np.divide(
    sum_rejects,
    col_totals[None, :],
    out=np.zeros(sum_rejects.shape, dtype=float),
    where=col_totals[None, :] > 0,
)
wjj = 0.01
# The cluster weights share the remaining 1 - wjj of each column's total.
important_weights = temp[:, any_successes] * (1 - wjj)
important_weights.shape

  temp = sum_rejects / np.sum(sum_rejects,axis = 0)[None,:]


(5, 8)

In [303]:
# Shape check: important_weights should have one column per entry of relevant_mu.
[relevant_mu.shape , important_weights.shape]

[(8,), (5, 8)]

In [304]:
# Shape check of the square matrix with wjj on its diagonal, one row/column per entry of relevant_mu.
np.diag(np.full_like(relevant_mu,wjj)).shape

(8, 8)

In [306]:
# Stack the self-weight block (wjj on the diagonal) on top of the cluster
# weights, so that column j holds all mixture weights for relevant_mu[j].
self_weights = np.diag(np.full_like(relevant_mu, wjj))
full_weights = np.append(self_weights, important_weights, axis=0)
# Each column is expected to sum to 1.
full_weights.sum(axis=0)


array([1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# Importance-sampling centres: the retained original means followed by the
# k-means cluster centres (np.append flattens both inputs into one 1-D array).
mu_importance = np.append(relevant_mu, mu_cluster_centers)
n_centres = len(mu_importance)
# Draw 1000 standard normals per centre, then shift each column by its centre.
z_importance = np.random.normal(0, 1, size=1000 * n_centres).reshape(1000, n_centres)
data_importance = z_importance + mu_importance[None, :]

In [312]:
# Shape check before broadcasting: draws, importance centres, target means, and the transposed weight matrix.
data_importance.shape, mu_importance.shape, relevant_mu.shape, np.transpose(full_weights).shape

((1000, 13), (13,), (8,), (8, 13))

In [418]:
# Mixture importance-sampling estimate of P(X > 1.96) for each relevant_mu[j].
#dimensions: i (simulations), j (initial theta), k (importance theta which generates samples), m (second dummy copy of importance theta)
n_sims = data_importance.shape[0]  # draws per importance centre; replaces a hard-coded 1000 in the variance
weights_jk = np.transpose(full_weights)  # weights_jk[j, k]: weight of centre k for target mean j
# inside_exponent[i, j, k, m] is the log of P_m(x_ik) / P_j(x_ik) for
# unit-variance normals: x*(mu_m - mu_j) - mu_m^2/2 + mu_j^2/2.
# NOTE(review): an earlier comment suspected a bug in these two lines; the
# expression matches the expansion of (x - mu_j)^2/2 - (x - mu_m)^2/2.
inside_exponent = data_importance[:,None,:, None]*(mu_importance[None,None,None, :] - relevant_mu[None,:,None, None]) - mu_importance[None,None,None,:]**2/2 + relevant_mu[None,:,None, None]**2/2
likelihood_ratios = np.exp(inside_exponent)
# denoms[i, j, k] = sum over m of w_jm * P_m(x_ik) / P_j(x_ik)
denoms = np.sum(likelihood_ratios * weights_jk[None, :, None, :], axis = 3)
rejects = data_importance > 1.96
# Re-weighted rejection indicator, computed once and reused for mean and second moment.
weighted_rejects = rejects[:,None,:]/denoms
inner_mean = np.mean(weighted_rejects, axis = 0) #this is the inner sum divided by n_j
inner_mse_estimate = np.mean(weighted_rejects**2, axis = 0) - inner_mean**2 # empirical variance of each re-weighted observation
final_result = np.sum(inner_mean * weights_jk, axis = 1)
# Variance of each per-centre mean is inner_mse_estimate / n_sims; centres are combined with squared weights.
final_variance_estimate = np.sum((inner_mse_estimate/n_sims) * (weights_jk**2), axis = 1)

In [419]:
# Importance-sampling estimate of the rejection probability, one entry per value in relevant_mu.
final_result

array([0.00279485, 0.00905081, 0.02472714, 0.05938342, 0.12325566,
       0.22408063, 0.36091008, 0.51831282])

In [420]:
# Estimated variance of each entry of final_result.
final_variance_estimate

array([2.44363075e-08, 1.23098518e-07, 8.11344287e-07, 2.78396900e-06,
       8.33830844e-06, 1.97106349e-05, 3.27258136e-05, 4.35984712e-05])

In [422]:
# Estimated sample-size ratio: the variance of a plain 1000-draw binomial
# estimate of each rejection probability, divided by the importance-sampling
# variance estimate.
binomial_variance = final_result * (1 - final_result) / 1000
binomial_variance / final_variance_estimate
# Hmm... this is not excellent. Importance sampling does better for the first
# few values of mu, but not consistently for larger values.
# Author's guess: this is due to an incorrect variance calculation.


array([114.05337211,  72.85946643,  29.72315337,  20.06381023,
        12.95990717,   8.82105016,   7.04807515,   5.72645401])

In [423]:
# Importance-sampling centres: the entries of relevant_mu followed by the k-means cluster centres.
mu_importance

array([-0.8       , -0.4       ,  0.        ,  0.4       ,  0.8       ,
        1.2       ,  1.6       ,  2.        ,  3.97053085,  2.46820867,
        3.31656385,  2.11366402,  2.87094621])

The denominator formula:

$$\mathrm{denom}_j(x) = \sum_k w_{jk}\,\frac{P_k(x)}{P_j(x)}$$

The likelihood ratio is $$\frac{P_k(x)}{P_j(x)} = \exp\!\left(\frac{(x-\mu_j)^2}{2} - \frac{(x-\mu_k)^2}{2}\right) = \exp\!\left(-\frac{\mu_k^2}{2} + \frac{\mu_j^2}{2} + x(\mu_k - \mu_j)\right)$$

Now let's generalize this to two-dimensional mu!

In [None]:
# Pilot run collected into one cell: simulate, cluster the rejections with
# k-means, and count labels per mean.
# NOTE(review): mu is still a 1-D grid here, so this repeats the 1-D setup.
mu = np.linspace(-2, 2, 11)
z = np.random.normal(0, 1, 1000)
data = mu[None,:] + z[:,None]
flat_data =  data.flatten()
#Now, we select the rejections in order to run k-means on them
selection = flat_data > 1.96
rejections = flat_data[selection]
standardized_rejections = (rejections - np.mean(rejections))/np.std(rejections)
n_clusters = 5
kmeans= KMeans(n_clusters=n_clusters,
               init = "random",
            n_init=10,
               max_iter=300,
               random_state = 42)
kmeans.fit(standardized_rejections.reshape(-1,1))
# Map the centroids back from standardized units to the original scale.
mu_cluster_centers = kmeans.cluster_centers_ * np.std(rejections) + np.mean(rejections)
# Label every draw: -1 for non-rejections, otherwise the k-means cluster index.
n_orig = len(flat_data)
flat_labels = np.full(n_orig, -1,dtype = np.int32)
flat_labels[selection] = kmeans.labels_
labels = flat_labels.reshape(data.shape)
n_sims_per_theta = data.shape[0] # 1000 for now
n_theta = data.shape[1] # 11 for now
target_fraction = np.full((n_clusters + 1, n_theta),-1)
labelset = np.unique(labels)  # labels actually observed; kept for inspection
# Bin edges cover the full label range -1..n_clusters-1 rather than only the
# observed labels, so a missing label cannot change the number of bins.
labelbins = np.arange(-1, n_clusters + 1) - 0.5
for i in range(n_theta):
    # Counts per label for the draws generated under mu[i].
    target_fraction[:,i] = np.histogram(labels[:,i], bins = labelbins)[0]

In [None]:
#Pilot sims done, now the real run:
n_per_thetaj = 1000  # draws per importance centre; used below instead of repeating the literal
# We want this to net out to, let's say, 1% of the total weight...
# Drop row 0 (non-rejections) so only the per-cluster rejection counts remain.
sum_rejects = np.delete(target_fraction, 0, axis = 0)
any_successes = np.sum(sum_rejects, axis = 0) > 0
relevant_mu = mu[any_successes]
# Normalise each column by its rejection total. Columns with a zero total used
# to trigger a divide-by-zero RuntimeWarning and NaN entries; they are left at
# 0 instead (and are discarded by the any_successes mask below).
col_totals = np.sum(sum_rejects, axis = 0)
temp = np.divide(
    sum_rejects,
    col_totals[None, :],
    out=np.zeros(sum_rejects.shape, dtype=float),
    where=col_totals[None, :] > 0,
)
wjj = 0.01
important_weights = temp[:,any_successes]*(1 - wjj)
# Rows: wjj-diagonal block for the target means, then the cluster weights.
full_weights = np.append(np.diag(np.full_like(relevant_mu,wjj)), important_weights, axis = 0)
# Now we do the importance samples:
mu_importance = np.append(relevant_mu,mu_cluster_centers)
z_importance = np.random.normal(0, 1, size = n_per_thetaj * len(mu_importance)).reshape(n_per_thetaj, len(mu_importance))
data_importance = mu_importance[None,:] + z_importance
# dimensions: i (simulations), j (initial theta), k (importance theta which
# generates samples), m (second dummy copy of importance theta).
# inside_exponent is the log of P_m(x_ik) / P_j(x_ik) for unit-variance normals.
inside_exponent = data_importance[:,None,:, None]*(mu_importance[None,None,None, :] - relevant_mu[None,:,None, None]) - mu_importance[None,None,None,:]**2/2 + relevant_mu[None,:,None, None]**2/2
likelihood_ratios = np.exp(inside_exponent)
# denoms[i, j, k] = sum over m of w_jm * P_m(x_ik) / P_j(x_ik)
denoms = np.sum(likelihood_ratios * np.transpose(full_weights)[None, :, None, :], axis = 3)
rejects = data_importance > 1.96
# Re-weighted rejection indicator, computed once and reused below.
weighted_rejects = rejects[:,None,:]/denoms
inner_mean = np.mean(weighted_rejects, axis = 0) #this is the inner sum divided by n_j
inner_mse_estimate = np.mean(weighted_rejects**2, axis = 0) - inner_mean**2 # empirical variance of each re-weighted observation
final_result = np.sum(inner_mean * np.transpose(full_weights), axis = 1)
final_variance_estimate =np.sum((inner_mse_estimate/n_per_thetaj) * (np.transpose(full_weights)**2), axis = 1)


In [None]:
# Importance-sampling estimate of the rejection probability for each value in relevant_mu, from the combined run above.
final_result