In [110]:
%load_ext autoreload
%autoreload 2

In [7]:
import numpy as np
import cvxpy as cp
import multiprocessing
from multiprocessing import Pool
from functools import partial
from tqdm.notebook import tqdm
from choldate import cholupdate
from pycss.CSS import *
from pycss.subset_selection import *
from notebook_utils.utils import * 
from notebook_utils.data_generation import * 
from notebook_utils.missing_data import *

In [8]:
#def get_rel_error(W_hat, W):
#    return np.linalg.norm(W_hat - W)/np.linalg.norm(W)

#def get_regression_coeffs(Sigma, S):
#    return Sigma[complement(p, S), :][:, S] @ np.linalg.inv(Sigma[S, :][:, S])

def Sigma_MLE_from_MLE(MLE):
    
    noise = noise_from_MLE(MLE)
    p = np.sum(MLE['W_MLE'].shape)
    k = len(MLE['S_MLE'])
    
    Sigma_MLE = np.zeros((p, p))
    Sigma_MLE[:k, :k] = MLE['C_MLE'].copy()
    Sigma_MLE[k:, :k] = MLE['W_MLE'] @ MLE['C_MLE']
    Sigma_MLE[:k, k:] = Sigma_MLE[k:, :k].T.copy()
    Sigma_MLE[k:, k:] =  MLE['W_MLE'] @ MLE['C_MLE'] @ MLE['W_MLE'].T
    if noise == 'sph':
        np.fill_diagonal(Sigma_MLE[k:, k:], np.diag(Sigma_MLE[k:, k:]) + MLE['sigma_sq_MLE'])
    if noise == 'diag':
        np.fill_diagonal(Sigma_MLE[k:, k:], np.diag(Sigma_MLE[k:, k:]) + MLE['D_MLE'])
    perm_in_place(Sigma_MLE, np.concatenate([MLE['S_MLE'], complement(p, MLE['S_MLE'])]), np.arange(p))
    
    return Sigma_MLE

def get_test_reconstruction_error(Sigma, S_hat, W_hat, mu_hat, mu=None):
    if mu is None:
        mu = np.zeros(len(mu_hat))
    S_hat_comp = complement(p, S_hat)
    return np.trace(Sigma[S_hat_comp, :][:, S_hat_comp]) + np.trace(W_hat @ Sigma[S_hat, :][:, S_hat] @ W_hat.T) - 2*np.trace(W_hat @  Sigma[S_hat, :][:, S_hat_comp]) + np.sum(np.square(mu[S_hat_comp] - mu_hat[S_hat_comp] - W_hat @ (mu[S_hat] - mu_hat[S_hat])) )
    

# Synthetic Data Experiment

In [3]:
p = 50
k = 20
C_chol = get_equicorrelated_chol(k, 0.25, diag=1)
C = C_chol @ C_chol.T
sigma_sq = 0.2

W = get_block_W(p, k, num_blocks=4, block_size=8, overlap=4)

np.random.seed(0)
W *= np.random.choice(np.array([-1, 1]), W.shape)
    
signal_sizes = np.sum(W * (W @ C), axis=1)
W = np.sqrt(1 - sigma_sq)/np.sqrt(signal_sizes)[:, None] * W

B = 1000
n=60
num_inits=1
noise = 'sph'
q = 0.2

In [9]:
p = 20
k = 4
C_chol = get_equicorrelated_chol(k, 0.25, diag=1)
C = C_chol @ C_chol.T
sigma_sq = 0.2
    
W = get_block_W(p, k, num_blocks=3, block_size=2, overlap=1)

np.random.seed(0)
W *= np.random.choice(np.array([-1, 1]), W.shape)
    
signal_sizes = np.sum(W * (W @ C), axis=1)
W = np.sqrt(1 - sigma_sq)/np.sqrt(signal_sizes)[:, None] * W

MLE = {'C_MLE': C,
       'W_MLE': W,
       'sigma_sq_MLE': sigma_sq,
       'S_MLE': np.arange(k)}

Sigma = Sigma_MLE_from_MLE(MLE)

B = 100
n=50
num_inits=1
noise = 'sph'
q = 0.2

In [10]:
X = generate_gaussian_PCSS_data(n, W=W, C_chol=C_chol, sigma_sq=sigma_sq, B=B)
X[np.where(np.random.binomial(1, q, X.shape))] = np.nan
X_c = X - np.nanmean(X, axis=1)[:, np.newaxis, :]

### Mean Imputation

In [19]:
missing_data_search = partial(mean_imputation_css_with_missing_data, k=k, num_inits=num_inits)
with Pool(multiprocessing.cpu_count()) as pool:
    results = [result for result in tqdm(pool.imap_unordered(missing_data_search, list(X)))]
    

0it [00:00, ?it/s]

In [20]:
S_correct = 0 
test_error = np.zeros(B)

for i, result in enumerate(results):
    
    S = result[0]
    converged = result[1]
    
    if not converged:
        print("iteration " + str(i) + " did not converge.")
    
    if np.all(set(S) == set(np.arange(k))):
        S_correct += 1
    
    X_i = X[i, :, :]
    X_filled = np.where(np.isnan(X_i), np.nanmean(X_i, axis=0), X_i)
    mu_hat, Sigma_hat = get_moments(X_filled)
    coeff_hat = get_regression_coeffs(Sigma_hat, S)
    test_error[i] = get_test_reconstruction_error(Sigma, S, coeff_hat, mu_hat)
    
print(S_correct/B)
print(np.mean(test_error)/(sigma_sq * (p-k)))

0.07
1.9632332112868536


## EM Algorithm

In [21]:
missing_data_search = partial(css_with_missing_data, k=k, num_inits=num_inits)
with Pool(multiprocessing.cpu_count()) as pool:
    results = [result for result in tqdm(pool.imap_unordered(missing_data_search, list(X)))]
    

0it [00:00, ?it/s]

In [22]:
S_correct = 0 
test_error = np.zeros(B)

for i, result in enumerate(results):
    
    MLE = result[0]
    converged = result[1]
    
    if not converged:
        print("iteration " + str(i) + " did not converge.")
    
    if np.all(set(MLE['S_MLE']) == set(np.arange(k))):
        S_correct += 1
    
    Sigma_MLE = Sigma_MLE_from_MLE(MLE)
    coeff_hat = get_regression_coeffs(Sigma_MLE, MLE['S_MLE'])
    test_error[i] = get_test_reconstruction_error(Sigma, MLE['S_MLE'], coeff_hat, MLE['mu_MLE'])

print(S_correct/B)
print(np.mean(test_error)/(sigma_sq * (p-k)))

0.2
1.6537064412645144


## Block OMP

In [23]:
missing_data_search = partial(block_OMP_with_missing_data, k=k)
with Pool(multiprocessing.cpu_count()) as pool:
    results = [result for result in tqdm(pool.imap_unordered(missing_data_search, list(X_c)))]
    

0it [00:00, ?it/s]

In [24]:
S_correct = 0 
test_error = np.zeros(B)

for i, result in enumerate(results):
    S = result 
    
    if np.all(S == set(np.arange(k))):
        S_correct += 1
    
    X_i = X_c[i, :, :]
    w = np.ones((n, p))
    w[np.where(np.isnan(X_i))] = 0
    Y_tilde = np.nan_to_num(X_i, nan=0)
    vec_Y_tilde = [Y_tilde[:, j] for j in range(p)]
    
    A_I_pinv = [ np.linalg.pinv(w[:, j][:, None] * Y_tilde[:, result]) for j in range(p)]
    W_hat = [A_I_pinv[j] @ vec_Y_tilde[j] for j in complement(p, result)]
    W_hat = np.vstack(W_hat)
    test_error[i] = get_test_reconstruction_error(Sigma, result, W_hat, np.nanmean(X_i, axis=0))

print(S_correct/B)
print(np.mean(test_error)/(sigma_sq * (p-k)))

0.0
1.9500986427457951


## Group Lasso

In [12]:
import time

In [17]:
start = time.time()
missing_data_search = partial(group_lasso_with_missing_data, k=k, solver='SOCP')
with Pool(multiprocessing.cpu_count()) as pool:
    results = [result for result in tqdm(pool.imap_unordered(missing_data_search, list(X)))]
end = time.time()
print(end - start)

0it [00:00, ?it/s]

SolverError: The solver SOCP is not installed.

In [None]:
results = []
for i in tqdm(range(B)):
    results.append(group_lasso_with_missing_data(X[i, :, :], k))

In [None]:
S_correct = 0
test_error = np.zeros(B)

for i, result in enumerate(results):
    if np.all(set(result[0]) == set(np.arange(k))):
        S_correct += 1
    
    test_error[i] = get_test_reconstruction_error(Sigma, result, W_hat, np.nanmean(X_i, axis=0))

print(S_correct/B)
print(np.mean(test_error)/(sigma_sq * (p-k)))

# Real Data Experiments