In [1]:
import numpy as np
from pycss.utils import *
from pycss.subset_selection import *
from pycss.CSS import *
import warnings
import tqdm
import itertools

In [5]:
# Draw a (p, p) matrix of standard-normal samples and subtract each column's mean.
# NOTE(review): `p` is not defined in any visible cell, so this relies on leftover kernel state.
X = np.random.normal(0, 1, (p, p))
X -= np.mean(X, axis=0)
# Display the column means after centring.
np.mean(X, axis=0)


array([ 2.22044605e-17, -8.32667268e-18, -2.49800181e-17,  3.33066907e-17,
       -3.33066907e-17,  2.22044605e-17,  1.52655666e-17, -1.11022302e-17,
       -2.22044605e-17,  4.44089210e-17,  2.22044605e-17,  6.93889390e-18,
       -7.77156117e-17,  3.33066907e-17, -5.55111512e-18, -4.99600361e-17,
        2.77555756e-17, -2.22044605e-17, -2.22044605e-17,  2.22044605e-17])

In [4]:
# Run the 'exhaustive' method on X with k=4, forcing index 0 in and index 19 out,
# then print the attributes stored on the fitted CSS object.
css =CSS()
css.select_subset_from_data(X, center=False, k=4, method='exhaustive', standardize=False, exclude=np.array([19]), include=np.array([0]), show_progress=False)
print(css.include)
print(css.exclude)
print(css.S)
print(np.trace(css.Sigma_R))
print(np.diag(css.Sigma_R))
print(len(css.S))

[0]
[19]
[ 0 16 17 18]
155.00000000000003
[ 0.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.  0.  0.
  0. 20.]
4


In [5]:
# Inspect the diagonal of css.Sigma_R (presumably the residual covariance; verify against pycss.CSS).
np.diag(css.Sigma_R)

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -6.66133815e-16])

array([17, 11])

In [62]:
# Build a length-p vector: 0..p-k-1 followed by k zeros.
# NOTE(review): `p` and `k` are not defined in any visible cell; this relies on kernel state.
np.concatenate([np.arange(p-k), np.zeros(k)])

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15.,  0.,  0.,  0.,  0.])

In [28]:
# Exercise the input validator with a 10x10 diagonal matrix and empty include/exclude arrays.
check_greedy_css_inputs(np.diag(np.arange(10)), 
                        k=3, 
                        cutoffs=None, 
                        include=np.array([]), 
                        exclude = np.array([]), 
                        tol=1e-10)

Include must be a numpy array of integers from 0 to p-1.
Exclude must be a numpy array of integers from 0 to p-1.


In [57]:
def exhaustive_css(Sigma, 
                   k, 
                   include=np.array([]),
                   exclude=np.array([]),
                   show_progress=True):
    """Search every size-k subset for the one minimising trace(regress_off(Sigma, S)).

    Parameters
    ----------
    Sigma : square array indexed by variable.
    k : total size of the subset to select.
    include : indices that must belong to every candidate subset.
    exclude : indices that may never be selected.
    show_progress : if True, print the number of subsets and wrap the
        iteration in a tqdm progress bar.

    Returns
    -------
    (best_S, best_Sigma_R) : the best subset (the `include` indices first,
        then the chosen remainder) and the matrix `regress_off` returned for
        it. Both are None when no candidate subset exists.
    """
    p = Sigma.shape[0]

    best_S = None
    best_Sigma_R = None
    best_obj_val = np.inf 

    # Candidates are all indices that are neither forced in nor forbidden.
    # The forbidden array is built once rather than once per index.
    all_idxs = np.arange(p)
    options = all_idxs[~np.isin(all_idxs, np.concatenate([include, exclude]))]
    to_add = k - len(include)
    # Working buffer: the fixed `include` prefix plus slots overwritten per combination.
    S = np.concatenate([include, -1*np.ones(to_add)]).astype(int)

    if show_progress:
        # Imported locally because the notebook's import cell does not import `math`.
        import math
        print("Iterating over " + str(math.comb(len(options), to_add)) + " different subsets...")
        iterator = tqdm.tqdm(itertools.combinations(options, to_add))
    else:
        iterator = itertools.combinations(options, to_add)

    for remaining in iterator:
        S[len(include):] = np.array(remaining).astype(int)
        Sigma_R = regress_off(Sigma, S)
        obj_val = np.trace(Sigma_R)
        if obj_val < best_obj_val:
            best_obj_val = obj_val
            # Copy: S is mutated in place on the next iteration, so keeping a
            # reference would always end up returning the last combination.
            best_S = S.copy()
            best_Sigma_R = Sigma_R
    
    return best_S, best_Sigma_R 
    


In [19]:
# Try exhaustive_css on a 22x22 diagonal matrix with forced inclusions and exclusions.
exhaustive_css(np.diag(np.arange(22)), k=10, include=np.array([0,4]), exclude=np.array([20, 13, 16]), show_progress=False)


# TODO: make sure S_init, include and exclude are integer-typed.

(array([ 0,  4, 11, 12, 14, 15, 17, 18, 19, 21]),
 array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  6.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.

In [455]:
# Default numerical tolerance used as the `tol` default by the selection functions below.
TOL =1e-8

In [456]:
def css_score(Sigma_R, tol=TOL):
    """Score each variable as minus (row sum of squares / diagonal entry).

    Rows whose diagonal entry is not greater than `tol` keep the fill value 0
    before negation, so no division by a (near-)zero variance is performed.
    Returns a float array with one score per row of `Sigma_R`.
    """
    variances = np.diag(Sigma_R)
    row_sq_sums = np.square(Sigma_R).sum(axis=1)
    ratios = np.zeros_like(variances, dtype=float)
    # Divide only where the variance exceeds the tolerance; other entries stay 0.
    np.divide(row_sq_sums, variances, out=ratios, where=(variances > tol))
    return -1 * ratios

def check_greedy_css_inputs(Sigma, k, cutoffs, include, exclude, tol):
    """Validate the arguments of `greedy_css`.

    Checks that `Sigma` is square, that exactly one of `k` / `cutoffs` is
    given, that `cutoffs` is either a scalar or a list/array of length p,
    that `k` is an integer in (0, p], that `include` and `exclude` are
    disjoint subsets of 0..p-1 of admissible size, and that `tol` is a float.

    Raises
    ------
    ValueError
        On the first violated condition. Returns None otherwise.
    """
    n, p = Sigma.shape 

    if not n == p:
        raise ValueError("Sigma must be a square matrix.")
  
    if k is None and cutoffs is None:
        raise ValueError("One of k or cutoff must not be None.")
  
    if k is not None and cutoffs is not None:
        raise ValueError("Only one of k or cutoffs can be specified.")

    if cutoffs is not None:
        # A sequence must have one cutoff per variable; anything else must be a
        # scalar. (The previous combined condition rejected every sequence.)
        if isinstance(cutoffs, (list, np.ndarray)):
            cutoffs_ok = len(cutoffs) == p
        else:
            cutoffs_ok = isinstance(cutoffs, (int, np.integer, float))
        if not cutoffs_ok:
            raise ValueError("Cutoffs must be a single value or length p.")

    if k is not None and not isinstance(k, (int, np.integer)):
        raise ValueError("k must be an integer.")
    if k is not None and (k <= 0 or k > p):
        raise ValueError("k must be > 0 and <= p.")

    set_include = set(include)
    set_exclude = set(exclude)
    # Each message now names the argument that actually failed.
    if not set_include.issubset(np.arange(p)):
        raise ValueError("Include must be a subset of the available indices.")
    if not set_exclude.issubset(np.arange(p)):
        raise ValueError("Exclude must be a subset of the available indices.")
    if len(set_exclude.intersection(set_include)) > 0:
        raise ValueError("Include and exclude must be disjoint.")
        
    if len(exclude) == p:
        raise ValueError("Cannot exclude everything.")
    if k is not None and len(include) > k:
        raise ValueError("Cannot include more than k.")
    if k is not None and len(exclude) > p - k:
        raise ValueError("Cannot exclude more than p-k.")

    if not isinstance(tol, float):
        raise ValueError("tol must be a float.")

    return

def greedy_css(Sigma,
               k=None,
               cutoffs=None,
               include=np.array([]),
               exclude=np.array([]),
               tol=TOL,
               ):
    """Greedily select variables, one at a time, by minimising `css_score`.

    Variables listed in `include` are selected first, in the given order.
    Afterwards the active variable with the smallest score is chosen at each
    step, with variables in `exclude` given an infinite score. After every
    selection the chosen variable is regressed off the active block, and any
    variable whose remaining diagonal entry is below `tol` is moved out of
    the active block.

    Selection stops when every remaining active variable is excluded, when
    `k` variables have been selected, or when trace(Sigma_R) is at or below
    `cutoffs[num_selected - 1]`. A scalar cutoff is broadcast to length p.

    Returns
    -------
    (S, Sigma_R) : the selected indices in selection order, and the working
        matrix after `perm_in_place` is applied with argsort(idx_order)
        (presumably restoring the original variable order; verify against
        pycss.utils).
    """

    check_greedy_css_inputs(Sigma, k, cutoffs, include, exclude, tol)

    Sigma_R = Sigma.copy()
    p = Sigma.shape[0]
    S = -1 * np.ones(p).astype(int)

    if isinstance(cutoffs, (int, np.integer, float)):
        cutoffs = cutoffs * np.ones(p)

    # idx_order[i] is the original index of the variable now at position i.
    idx_order = np.arange(p)
    num_active = p

    selected_enough = False
    num_selected = 0

    while not selected_enough:

        # subset to active variables (a view, so in-place updates reach Sigma_R)
        Sigma_R_active = Sigma_R[:num_active, :num_active]
        # True when a requested variable already sits outside the active block.
        already_explained = False

        if num_selected < len(include):
            j_star = np.where(idx_order == include[num_selected])[0][0]

            # If the include variables are collinear, warn and record the
            # variable without regressing it off again.
            if j_star > num_active - 1:
                warnings.warn("Variables " + str(include[:num_selected + 1]) + " that have been requested to be included are colinear.")
                already_explained = True

        else:
            # compute objective values
            obj_vals = css_score(Sigma_R_active, tol=tol)

            # set the exclude objective values to infinity
            obj_vals[np.isin(idx_order[:num_active], exclude)] = np.inf
            # select next variable
            j_star = random_argmin(obj_vals)

        S[num_selected] = idx_order[j_star]
        num_selected += 1

        if not already_explained:
            # regress off selected variable
            regress_one_off_in_place(Sigma_R_active, j_star, tol=tol)

            # swap selected variable with last active position
            swap_in_place(Sigma_R, [j_star], [num_active - 1], idx_order=idx_order)
            # decrement number active
            num_active -= 1

            # swap any variables with < tol variance to bottom and update num active
            zero_idxs = np.where(np.diag(Sigma_R_active)[:num_active] < tol)[0]
            num_zero_idxs = len(zero_idxs)
            idxs_to_swap = np.arange(num_active - num_zero_idxs, num_active)
            swap_in_place(Sigma_R, zero_idxs, idxs_to_swap, idx_order=idx_order)
            num_active -= num_zero_idxs

        # The checks below also run after a collinear include. Previously that
        # path skipped them, so the loop could go on to select more than k.

        # continue if not enough included
        if num_selected < len(include):
            continue
        # terminate early if all variables are explained
        if set(idx_order[:num_active]).issubset(exclude):
            selected_enough = True
        # terminate if user requested k and k have been selected
        if k is not None and num_selected == k:
            selected_enough = True
        # terminate if below user's cutoff
        if cutoffs is not None and np.trace(Sigma_R) <= cutoffs[num_selected - 1]:
            selected_enough = True

    perm_in_place(Sigma_R, np.arange(p), np.argsort(idx_order))

    return S[:num_selected], Sigma_R


In [457]:
def subset_factor_score(Sigma_R, tol=TOL):
    """Score each row of `Sigma_R` for the subset-factor objective.

    Builds a matrix whose (i, j) entry is diag[j] - Sigma_R[i, j]**2 / diag[i],
    with the diagonal overwritten by 1.

    Returns
    -------
    (objective_values, (rows, cols))
        If any entry of that matrix is below `tol`, `objective_values` is None
        and (rows, cols) are the `np.where` coordinates of the offending
        entries. Otherwise `objective_values[i]` is log(diag[i]) plus the sum
        over row i of the log entries, and both coordinate arrays are empty.
    """
    variances = np.diag(Sigma_R)
    residuals = variances - (1/variances)[:, None] * np.square(Sigma_R)
    # The diagonal is set to 1 so it adds nothing to the log-sum below.
    np.fill_diagonal(residuals, 1)

    too_small = residuals < tol
    if np.any(too_small):
        return None, np.where(too_small)

    objective_values = np.log(variances) + np.log(residuals).sum(axis=1)
    return objective_values, (np.array([]), np.array([]))

def check_greedy_subset_factor_inputs(Sigma, cutoffs, include, exclude, tol):
    """Validate the arguments of `greedy_subset_factor_selection`.

    Requires a square `Sigma`, a list/array of p + 1 `cutoffs`, `include` and
    `exclude` given as disjoint numpy arrays of integers in 0..p-1 (an empty
    array of any dtype is accepted), and a float `tol`.

    Raises
    ------
    ValueError
        On the first violated condition. Returns None otherwise.
    """
    n, p = Sigma.shape 

    if not n == p:
        raise ValueError("Sigma must be a square matrix.")
  
    if not isinstance(cutoffs, (list, np.ndarray)) or not len(cutoffs) == p + 1:
        raise ValueError("Must provide p + 1 cutoffs.")

    for label, idxs in (("Include", include), ("Exclude", exclude)):
        # The type is checked before any iteration, so a non-array argument
        # raises ValueError rather than failing inside set(). Any integer
        # dtype is accepted, not just the platform default 'int'.
        idxs_ok = (isinstance(idxs, np.ndarray)
                   and (len(idxs) == 0 or np.issubdtype(idxs.dtype, np.integer))
                   and set(idxs).issubset(np.arange(p)))
        if not idxs_ok:
            raise ValueError(label + ' must be a numpy array of integers from 0 to p-1.')

    if len(set(exclude).intersection(set(include))) > 0:
        raise ValueError("Include and exclude must be disjoint.")

    if not isinstance(tol, float):
        raise ValueError("tol must be a float.")

    return

def greedy_subset_factor_selection(Sigma,
                                   cutoffs,
                                   include=np.array([]),
                                   exclude=np.array([]),
                                   tol=TOL,
                                   ):
    """Greedily grow a subset until the log-determinant falls below its cutoff.

    `cutoffs[s]` is the threshold applied once s variables are selected.
    Variables in `include` are selected first, in order. Afterwards the active
    variable with the smallest `subset_factor_score` is chosen, with variables
    in `exclude` given an infinite score.

    Returns
    -------
    (S, reject)
        `S` holds the selected indices in selection order. `reject` is False
        when a cutoff was met, when a collinearity was detected (a warning is
        issued), or when at least p - 1 variables were selected. It keeps the
        result of the size-0 test when the loop stops only because every
        remaining variable is excluded.
    """

    check_greedy_subset_factor_inputs(Sigma, cutoffs, include, exclude, tol)
    
    Sigma_R = Sigma.copy()
    p = Sigma.shape[0]
    
    # check if size-0 subset is sufficient 
    reject = np.sum(np.log(np.diag(Sigma_R))) > cutoffs[0]
    if (not reject and len(include) == 0) or len(exclude) == p:
        return np.array([]), reject
        
    
    S = -1 * np.ones(p).astype(int)
    # idx_order[i] is the original index of the variable now at position i.
    idx_order = np.arange(p)
    num_active = p

    selected_enough = False
    num_selected = 0

    # residual variance of each selected variable at the time it was selected
    running_residuals = -1 * np.ones(p)

    # subset to active variables
    Sigma_R_active = Sigma_R[:num_active, :num_active]
    while not selected_enough:

        if num_selected < len(include):
            j_star = np.where(idx_order == include[num_selected])[0][0]
        else:
            # compute objective values
            obj_vals, colinearity_errors = subset_factor_score(Sigma_R_active, tol=tol)

            # if adding a variable results in zero residual, warn the user and fail to reject
            if len(colinearity_errors[0]) > 0:
                # The reported coordinates are positions inside the active
                # block, so translate them back to variable indices.
                added = idx_order[colinearity_errors[0]]
                explained = idx_order[colinearity_errors[1]]
                warnings.warn("When you add variable "  + str(added) + " to " + str(S[:num_selected]) + " it perfectly explains variable " + str(explained) + ".")
                reject = False
                # Return the current subset plus the first offending variable.
                # The old `[0][1]` lookup raised IndexError for a single pair
                # and produced a 0-d array that concatenate rejects.
                # NOTE(review): `exclude` is not consulted when picking this variable.
                return np.concatenate([S[:num_selected], added[:1]]), reject

            # set the exclude objective values to infinity
            obj_vals[np.isin(idx_order[:num_active], exclude)] = np.inf
            # select next variable
            j_star = random_argmin(obj_vals)

        S[num_selected] = idx_order[j_star]
        running_residuals[num_selected] = Sigma_R_active[j_star, j_star]
        num_selected += 1

        # regress off selected variable
        regress_one_off_in_place(Sigma_R_active, j_star, tol=tol)

        # swap selected variable with last active position
        swap_in_place(Sigma_R, [j_star], [num_active - 1], idx_order=idx_order)
        # decrement number active
        num_active -= 1

        # subset_to_active_variables
        Sigma_R_active = Sigma_R[:num_active, :num_active]

        # when including variables that have been requested to be included, ensure that no residuals are zero
        if num_selected <= len(include):
            zeros = np.where(np.diag(Sigma_R_active) < tol)[0]
            if len(zeros) > 0:
                warnings.warn("Variables " + str(S[:num_selected]) + " perfectly explain " + str(idx_order[zeros]) + ".")
                reject = False
                return include, reject

        # continue if not enough included
        if num_selected < len(include):
            continue

        # if we fail to reject, terminate and return
        if np.sum(np.log(np.diag(Sigma_R_active))) + np.sum(np.log(running_residuals[:num_selected])) <= cutoffs[num_selected]:
            reject = False
            selected_enough = True

        # forcefully fail to reject once we've selected at least p-1 in case of numerical instability
        if num_selected >= p - 1:
            reject = False
            selected_enough = True

        # terminate if no variables are left to select
        if set(idx_order[:num_active]).issubset(exclude):
            selected_enough = True

    return S[:num_selected], reject

In [458]:
def check_swapping_css_inputs(Sigma,
                              k,
                              max_iter,
                              num_inits, 
                              S_init,
                              include,
                              exclude,
                              tol):
    """Validate the arguments of `swapping_css`.

    Requires a square `Sigma`, an integer `k` in (0, p], an optional `S_init`
    that is a numpy array of exactly k distinct indices in 0..p-1 containing
    `include` and avoiding `exclude`, disjoint `include` / `exclude` subsets
    of 0..p-1 of admissible size, and a float `tol`. `max_iter` and
    `num_inits` are accepted but not checked.

    Raises
    ------
    ValueError
        On the first violated condition. Returns None otherwise.
    """
    n, p = Sigma.shape 

    if not n == p:
        raise ValueError("Sigma must be a square matrix.")

    if not isinstance(k, (int, np.integer)) or k <= 0 or k > p:
        raise ValueError("k must be an integer > 0 and <= p.")
    
    if S_init is not None:
        # Both the raw length and the number of distinct entries must equal k.
        # Checking only the distinct count let a longer array with repeats through.
        if (not isinstance(S_init, np.ndarray)
                or len(S_init) != k
                or len(set(S_init)) != k
                or (not set(S_init).issubset(np.arange(p)))):
            raise ValueError("S_init must be a numpy array of length k containing indices 0 to p-1.")
        if not set(include).issubset(S_init):
            raise ValueError("Include must be a subset of S_init.")
        if len(set(exclude).intersection(S_init)) > 0:
            raise ValueError("S_init cannot contain any elements in exlcude.")
        
    set_include = set(include)
    set_exclude = set(exclude)
    # Each message now names the argument that actually failed.
    if not set_include.issubset(np.arange(p)):
        raise ValueError("Include must be a subset of the available indices.")
    if not set_exclude.issubset(np.arange(p)):
        raise ValueError("Exclude must be a subset of the available indices.")
    if len(set_exclude.intersection(set_include)) > 0:
        raise ValueError("Include and exclude must be disjoint.")

    if len(include) > k:
        raise ValueError("Cannot include more than k.")
    if len(exclude) > p - k:
        raise ValueError("Cannot exclude more than p-k.")

    if not isinstance(tol, float):
        raise ValueError("tol must be a float.")

def swapping_css_with_init(Sigma,
                           S_init,
                           max_iter,
                           include,
                           exclude,
                           tol=TOL):
    """Run the swapping heuristic for column subset selection from one start.

    Starting from `S_init` (its length fixes the subset size k), the routine
    repeatedly takes the first member of the current subset, removes it unless
    it is in `include`, and re-selects the active variable with the smallest
    `css_score` (variables in `exclude` get an infinite score). If the removed
    variable is among the minimisers it is kept; otherwise a minimiser is
    drawn at random. The new choice is appended as the last subset member.

    The loop ends after `max_iter` full sweeps over the k positions, or as
    soon as `not_replaced` (the count of consecutive non-swaps) equals
    k - len(include).

    Returns
    -------
    (S, Sigma_R, converged)
        The final subset, the working matrix after `perm_in_place` with
        argsort(idx_order) (presumably restoring the original variable order;
        verify against pycss.utils), and whether the stopping rule above was
        met. Returns (None, None, None) when `is_invertible` reports that the
        sub-matrix of `Sigma` indexed by the initial subset is not invertible.

    NOTE(review): `tol` is not forwarded to `is_invertible` or to the
    `regress_*` helpers here, so those calls use their own defaults.
    """
    k = len(S_init)
    p = Sigma.shape[0]
    d = p-k
    include_set = set(include)

    # idx_order[i] is the original index of the variable now at position i
    idx_order = np.arange(p)

    Sigma_R = Sigma.copy()
    # these will always be the indices of the selected subset
    subset_idxs = np.arange(d, p)
    # swap initial variables to bottom of Sigma
    swap_in_place(Sigma_R, subset_idxs, S_init, idx_order=idx_order)
    S = idx_order[d:].copy()
    Sigma_S = Sigma[:, S][S, :].copy()
    # second return value is presumably a Cholesky factor of Sigma_S (it is fed to the update_cholesky_* helpers)
    invertible, Sigma_S_L = is_invertible(Sigma_S)   

    if not invertible:
        return None, None, None 

    regress_off_in_place(Sigma_R, np.arange(d, p))

    # number of completed iterations
    N = 0
    # counter of how many consecutive times we have chosen not to swap 
    not_replaced = 0
    # permutation which shifts the last variable in the subset to the top of the subset
    subset_idxs_permuted = np.concatenate([subset_idxs[1:], np.array([subset_idxs[0]])])
    converged = False

    while N < max_iter and (not converged):
        for i in range(k):
            S_0 = S[0]

            # Update cholesky after removing first variable from subset
            Sigma_T_L = update_cholesky_after_removing_first(Sigma_S_L)

            if S_0 not in include_set:
            
                # Subset with first variable removed from selected subset
                T = S[1:]

                # Update residual covariance after removing first variable from subset
                # (v is indexed by original variable; reordered_v follows the current row order)
                v = Sigma[:, S_0] - Sigma[:, T] @ solve_with_cholesky(Sigma_T_L, Sigma[T, S_0]) if k > 1 else Sigma[:, S_0]
                reordered_v = v[idx_order]
                Sigma_R = Sigma_R + np.outer(reordered_v, reordered_v)/v[S_0]
                
                # Swap first variable from subset to top of residual matrix
                swap_in_place(Sigma_R, np.array([0]), np.array([d]), idx_order=idx_order)

                # find indices of variables with zero variance
                zero_idxs = np.where(np.diag(Sigma_R)[:(d + 1)] <= tol)[0]
                num_zero_idxs = len(zero_idxs)
                # In residual matrix, swap variables with zero indices to right above currently selected subset (of size k-1)
                swap_in_place(Sigma_R, zero_idxs, np.arange(d + 1 - num_zero_idxs, d + 1), idx_order=idx_order)
                
                # update num_active
                num_active = d + 1 - num_zero_idxs

                # compute objectives for active variables and find minimizers
                obj_vals = css_score(Sigma_R[:num_active, :num_active], tol=tol)

                # set the objective value to infinity for the excluded variables
                obj_vals[np.in1d(idx_order[:num_active], exclude)] = np.inf

                choices = np.flatnonzero(obj_vals == obj_vals.min())

                # if removed variable is a choice, select it, otherwise select a random choice
                if 0 in choices:
                    not_replaced += 1
                    j_star = 0
                else:
                    not_replaced = 0
                    j_star = np.random.choice(choices)
                
                S_new = idx_order[j_star]
                
                # In residual covariance, regress selected variable off the remaining
                #regress_one_off_in_place(Sigma_R[:(d+1), :(d+1)], j_star) #alternative option
                regress_one_off_in_place(Sigma_R[:num_active, :num_active], j_star)
                # In residual covariance swap new choice to top of selected subset 
                swap_in_place(Sigma_R, np.array([j_star]), np.array([d]), idx_order=idx_order)
              
            else:
                # variables the caller forced in are never swapped out
                S_new = S_0 
            
            # Add new choice as the last variable in selected subset
            S[:k-1] = S[1:]
            S[k-1] = S_new
            # Update cholesky after adding new choice as last variable in selected subset
            Sigma_S_L = update_cholesky_after_adding_last(Sigma_T_L, Sigma[S_new, S])
            
            # permute first variables in selected subset to the last variable in the residual matrix
            perm_in_place(Sigma_R, subset_idxs,  subset_idxs_permuted, idx_order=idx_order)

            # stop once not_replaced reaches the number of swappable positions
            if not_replaced == k - len(include):
                converged=True
                break

        N += 1

    perm_in_place(Sigma_R, np.arange(p), np.argsort(idx_order))
    
    return S, Sigma_R, converged 

def swapping_css(Sigma,
                 k,
                 max_iter=100,
                 num_inits=1, 
                 S_init=None,
                 include=np.array([]),
                 exclude=np.array([]),
                 tol=TOL):
    """Run `swapping_css_with_init` from one or several starts and keep the best.

    If `S_init` is given it is used as the single start. Otherwise `num_inits`
    random starts are drawn, each made of `include` plus a random draw
    (without replacement) from the indices in neither `include` nor `exclude`.
    The best run is the one with the smallest trace of the returned matrix.

    Returns
    -------
    (best_S, best_Sigma_R, best_S_init, best_converged)
        All four are None when the included variables are collinear or every
        start was reported as non-invertible. A warning is issued in both cases.
    """

    check_swapping_css_inputs(Sigma,
                              k,
                              max_iter,
                              num_inits, 
                              S_init,
                              include,
                              exclude,
                              tol)
    
    best_converged = None
    best_S = None
    best_S_init = None 
    best_Sigma_R = None
    best_obj_val = np.inf 
    # Build the exclusion set once rather than once per candidate index.
    excluded = set(exclude)
    not_include = np.array([idx for idx in complement(Sigma.shape[0], include) if idx not in excluded])
    
    if len(include) > 0:
        invertible, _ = is_invertible(Sigma[include, :][:, include], tol=tol)
        if not invertible:
            warnings.warn("The variables requested to be included are colinear.")
            return best_S, best_Sigma_R, best_S_init, best_converged   
    
    no_initialization = (S_init is None)
    if not no_initialization:
        # A user-supplied start is deterministic, so one run is enough.
        num_inits = 1

    for _ in range(num_inits):
        if no_initialization:
            S_init = np.concatenate([include, np.random.choice(not_include, k-len(include), replace=False)]).astype(int)

        # Forward the caller's tolerance. The global TOL was passed before,
        # which silently ignored the `tol` argument.
        S, Sigma_R, converged  = swapping_css_with_init(Sigma=Sigma,
                                                        S_init=S_init,
                                                        max_iter=max_iter, 
                                                        include=include,
                                                        exclude=exclude,
                                                        tol=tol)
        if S is None:
            # this start was reported as non-invertible; try the next one
            continue 
      
        obj_val = np.trace(Sigma_R)
        if obj_val < best_obj_val:
            best_obj_val = obj_val 
            best_S = S
            best_S_init = S_init
            best_Sigma_R = Sigma_R
            best_converged = converged 

    if best_S is None:
        warnings.warn("All the initializations tried were colinear.")
    return best_S, best_Sigma_R, best_S_init, best_converged

In [459]:
def check_swapping_subest_factor_inputs(Sigma,
                                        k,
                                        cutoff, 
                                        max_iter,
                                        num_inits, 
                                        S_init,
                                        find_minimizer,
                                        include,
                                        exclude,
                                        tol):
    """Validate the arguments of `swapping_subset_factor_selection`.

    Requires a square `Sigma`, an integer `k` in [0, p], `include` and
    `exclude` given as disjoint numpy arrays of integers in 0..p-1 (an empty
    array of any dtype is accepted), an optional `S_init` that is a numpy
    array of exactly k distinct integer indices containing `include` and
    avoiding `exclude`, admissible include/exclude sizes, and a float `tol`.
    `cutoff`, `max_iter`, `num_inits` and `find_minimizer` are not checked.

    Raises
    ------
    ValueError
        On the first violated condition. Returns None otherwise.
    """
    n, p = Sigma.shape 

    if not n == p:
        raise ValueError("Sigma must be a square matrix.")

    if not isinstance(k, (int, np.integer)) or k < 0 or k > p:
        raise ValueError("k must be an integer between 0 and p (inclusive).")
    
    for label, idxs in (("Include", include), ("Exclude", exclude)):
        # The type is checked before any iteration, so a non-array argument
        # raises ValueError rather than failing inside set(). Any integer
        # dtype is accepted, not just the platform default 'int'.
        idxs_ok = (isinstance(idxs, np.ndarray)
                   and (len(idxs) == 0 or np.issubdtype(idxs.dtype, np.integer))
                   and set(idxs).issubset(np.arange(p)))
        if not idxs_ok:
            raise ValueError(label + ' must be a numpy array of integers from 0 to p-1.')

    if len(set(exclude).intersection(set(include))) > 0:
        raise ValueError("Include and exclude must be disjoint.")
    
    if S_init is not None:
        # Both the raw length and the number of distinct entries must equal k.
        # Checking only the distinct count let a longer array with repeats through.
        s_init_ok = (isinstance(S_init, np.ndarray)
                     and (len(S_init) == 0 or np.issubdtype(S_init.dtype, np.integer))
                     and len(S_init) == k
                     and len(set(S_init)) == k
                     and set(S_init).issubset(np.arange(p)))
        if not s_init_ok:
            raise ValueError("S_init must be a numpy array of k integers from 0 to p-1 inclusive.")
        if not set(include).issubset(S_init):
            raise ValueError("Include must be a subset of S_init.")
        if len(set(exclude).intersection(S_init)) > 0:
            raise ValueError("S_init cannot contain any elements in exlcude.")
        

    if len(include) > k:
        raise ValueError("Cannot include more than k.")
    if len(exclude) > p - k:
        raise ValueError("Cannot exclude more than p-k.")


    if not isinstance(tol, float):
        raise ValueError("tol must be a float.")
    
    return 

def swapping_subset_factor_with_init(Sigma, 
                                     S_init,
                                     find_minimizer,
                                     cutoff, 
                                     max_iter,
                                     include,
                                     exclude,
                                     tol=TOL):
    """Run the swapping heuristic for subset-factor selection from one start.

    Starting from `S_init` (its length fixes the subset size k), the routine
    repeatedly takes the first member of the current subset, removes it unless
    it is in `include`, and re-selects the variable with the smallest
    `subset_factor_score` (variables in `exclude` get an infinite score). If
    the removed variable is among the minimisers it is kept; otherwise a
    minimiser is drawn at random.

    Returns
    -------
    (S, reject, log_det)
        `reject` is True when the final `log_det` exceeds `cutoff`.
        Special cases:
        * k == 0: returns an empty array and the size-0 test on diag(Sigma).
        * the start is reported non-invertible, or perfectly explains another
          variable: warns and returns (S_init, False, -inf).
        * a candidate swap would perfectly explain another variable: warns and
          returns the reduced subset plus that candidate, False, -inf.
        * `find_minimizer` is False and `log_det <= cutoff` after a swap:
          returns immediately with reject False.

    NOTE(review): `tol` is not forwarded to `is_invertible` or to the
    `regress_*` helpers here, so those calls use their own defaults.
    """
    
    
    k = len(S_init)
    
    # handle case where subset must be empty 
    if k == 0:
        log_det = np.sum(np.log(np.diag(Sigma)))
        reject = log_det > cutoff
        return np.array([]), reject, log_det 
    
    p = Sigma.shape[0]
    d = p-k
    include_set = set(include)

    # idx_order[i] is the original index of the variable now at position i
    idx_order = np.arange(p)

    Sigma_R = Sigma.copy()
    # these will always be the indices of the selected subset
    subset_idxs = np.arange(d, p)
    # swap initial variables to bottom of Sigma
    swap_in_place(Sigma_R, subset_idxs, S_init, idx_order=idx_order)
    S = idx_order[d:].copy()
    Sigma_S = Sigma[:, S][S, :].copy()
    invertible, Sigma_S_L = is_invertible(Sigma_S)   

    if not invertible:
        warnings.warn("Variables " + str(S_init) + " are colinear." )
        reject = False
        return S_init, reject, -np.inf  

    regress_off_in_place(Sigma_R, np.arange(d, p))
    
    where_zeros = np.where(np.diag(Sigma_R)[:d] < tol)[0]
    # The parenthesis was misplaced as len(where_zeros > 0). Same truth value, clearer intent.
    if len(where_zeros) > 0:
        warnings.warn("Variables " + str(S_init) + " perfectly explain " + str(idx_order[where_zeros]) )
        reject = False 
        return S_init, reject, -np.inf 
    

    # number of completed iterations
    N = 0
    # counter of how many consecutive times we have chosen not to swap 
    not_replaced = 0
    # permutation which shifts the last variable in the subset to the top of the subset
    subset_idxs_permuted = np.concatenate([subset_idxs[1:], np.array([subset_idxs[0]])])
    converged = False

    while N < max_iter and (not converged):
        for i in range(k):
            S_0 = S[0]

            # Update cholesky after removing first variable from subset
            Sigma_T_L = update_cholesky_after_removing_first(Sigma_S_L)

            if S_0 not in include_set:
            
                # Subset with first variable removed from selected subset
                T = S[1:]

                # Update residual covariance after removing first variable from subset
                v = Sigma[:, S_0] - Sigma[:, T] @ solve_with_cholesky(Sigma_T_L, Sigma[T, S_0]) if k > 1 else Sigma[:, S_0]
                reordered_v = v[idx_order]
                Sigma_R = Sigma_R + np.outer(reordered_v, reordered_v)/v[S_0]
                
                # Swap first variable from subset to top of residual matrix
                swap_in_place(Sigma_R, np.array([0]), np.array([d]), idx_order=idx_order)
                
                # compute objectives for active variables and find minimizers
                obj_vals, colinearity_errors = subset_factor_score(Sigma_R[:(d+1), :(d+1)], tol=tol)

                # if adding a variable results in zero residual, warn the user and fail to reject
                if len(colinearity_errors[0]) > 0:
                    # The reported coordinates are positions inside the leading
                    # (d+1)x(d+1) block, so translate them to variable indices.
                    added = idx_order[colinearity_errors[0]]
                    explained = idx_order[colinearity_errors[1]]
                    # Fixed: the message used an undefined `num_selected`
                    # (NameError). The current reduced subset is T.
                    warnings.warn("When you add variable "  + str(added) + " to " + str(T) + " it perfectly explains variable " + str(explained) + ".")
                    reject = False
                    # Fixed: the old `[0][1]` lookup raised IndexError for a single
                    # pair and produced a 0-d array that concatenate rejects.
                    # NOTE(review): `exclude` is not consulted when picking this variable.
                    return np.concatenate([T, added[:1]]), reject, -np.inf

                # set the objective value to infinity for the excluded variables
                obj_vals[np.isin(idx_order[:(d+1)], exclude)] = np.inf

                choices = np.flatnonzero(obj_vals == obj_vals.min())

                # if removed variable is a choice, select it, otherwise select a random choice
                if 0 in choices:
                    not_replaced += 1
                    j_star = 0
                else:
                    not_replaced = 0
                    j_star = np.random.choice(choices)
                
                S_new = idx_order[j_star]
                
                # In residual covariance, regress selected variable off the remaining
                regress_one_off_in_place(Sigma_R[:(d+1), :(d+1)], j_star)
                # In residual covariance swap new choice to top of selected subset 
                swap_in_place(Sigma_R, np.array([j_star]), np.array([d]), idx_order=idx_order)
              
            else:
                # variables the caller forced in are never swapped out
                S_new = S_0 
            
            # Add new choice as the last variable in selected subset
            S[:k-1] = S[1:]
            S[k-1] = S_new
            # Update cholesky after adding new choice as last variable in selected subset
            Sigma_S_L = update_cholesky_after_adding_last(Sigma_T_L, Sigma[S_new, S])
            
            # permute first variables in selected subset to the last variable in the residual matrix
            perm_in_place(Sigma_R, subset_idxs,  subset_idxs_permuted, idx_order=idx_order)
            
            # If you don't want to find the minimizer and log det is small enough, terminate now
            if not find_minimizer:
                log_det = np.sum(np.log(np.diag(Sigma_R)[:d])) + np.sum(np.log(np.square(np.diag(Sigma_S_L))))
                if log_det <= cutoff:
                    reject = False
                    return S, reject, log_det

            # stop once not_replaced reaches the number of swappable positions
            if not_replaced == k - len(include):
                converged=True
                break

        N += 1

    log_det = np.sum(np.log(np.diag(Sigma_R)[:d])) + np.sum(np.log(np.square(np.diag(Sigma_S_L))))
    reject = (log_det > cutoff)
    return S, reject, log_det

def swapping_subset_factor_selection(Sigma,
                                     k,
                                     cutoff,
                                     max_iter=100,
                                     num_inits=1, 
                                     S_init=None,
                                     find_minimizer=False, 
                                     include=np.array([]),
                                     exclude=np.array([]),
                                     tol=TOL):
    """Run `swapping_subset_factor_with_init` from one or several starts.

    If `S_init` is given, or k is 0 or 1, a single run is performed. Otherwise
    `num_inits` random starts are drawn, each made of `include` plus a random
    draw (without replacement) from the indices in neither `include` nor
    `exclude`.

    Returns
    -------
    (S, reject)
        With `find_minimizer` False, the first run that fails to reject is
        returned immediately. Otherwise the subset with the smallest log
        determinant over all runs is returned with the reject flag of that
        same run. If the included variables are collinear, a warning is issued
        and `include` padded to size k is returned with reject False.
    """
    
    check_swapping_subest_factor_inputs(Sigma,
                                        k,
                                        cutoff, 
                                        max_iter,
                                        num_inits, 
                                        S_init,
                                        find_minimizer,
                                        include,
                                        exclude,
                                        tol)
    
    best_reject = True
    best_S = None
    best_log_det = np.inf 
    # Build the exclusion set once rather than once per candidate index.
    excluded = set(exclude)
    not_include = np.array([idx for idx in complement(Sigma.shape[0], include) if idx not in excluded])
    
    if len(include) > 0:
        invertible, _ = is_invertible(Sigma[include, :][:, include], tol=tol)
        if not invertible:
            warnings.warn("The variables that have been requested to be included are colinear.")
            reject = False
            # Fixed the `incude` typo, which raised NameError on this path.
            return np.concatenate([include, not_include[:(k - len(include))]]).astype(int), reject
    
    no_initialization = (S_init is None)
    if not no_initialization or k == 0 or k == 1:
        num_inits = 1

    for _ in range(num_inits):
        if no_initialization:
            S_init = np.concatenate([include, np.random.choice(not_include, k-len(include), replace=False)]).astype(int)

        # Forward the caller's tolerance. The global TOL was passed before,
        # which silently ignored the `tol` argument.
        S, reject, log_det = swapping_subset_factor_with_init(Sigma=Sigma,
                                                              S_init=S_init,
                                                              find_minimizer=find_minimizer,
                                                              cutoff=cutoff,
                                                              max_iter=max_iter, 
                                                              include=include,
                                                              exclude=exclude,
                                                              tol=tol)
        if not find_minimizer and (not reject):
            return S, reject 

        # Keep the reject flag with the subset it belongs to. The flag of the
        # last start used to be returned next to the best subset.
        if log_det < best_log_det:
            best_S = S
            best_log_det = log_det 
            best_reject = reject

    return best_S, best_reject 

In [460]:
def sample_null_dist(n, p, k, B=int(1e5), seed=0):
    """Simulate B Monte Carlo draws of the statistic whose quantiles `Q` reports.

    Each draw is ``n * sum_j log(null_j / full_j + 1)`` over ``p - k`` columns,
    where ``full_j`` are chi-square variates with degrees of freedom
    ``n - k - 1, n - k - 2, ...`` and ``null_j`` are chi-square variates with
    degrees of freedom ``1, ..., p - k - 1`` preceded by a column of zeros.

    Parameters
    ----------
    n, p, k : int
        Only enter through ``n - k - 1`` (largest degrees of freedom),
        ``p - k`` (number of columns) and the final scaling by ``n``.
    B : int
        Number of Monte Carlo draws.
    seed : int or None
        Seed passed to ``np.random.seed`` before sampling. ``None`` leaves the
        global NumPy random state untouched.

    Returns
    -------
    numpy.ndarray of shape (B,)
    """
    if seed is not None:
        # Bug fix: the seed argument used to be ignored (always reseeded with 0).
        np.random.seed(seed)

    num_adjusted_samples = n - k - 1
    num_features = p - k

    # Degrees of freedom decrease by one per column, starting at n - k - 1.
    full_dfs = num_adjusted_samples - np.arange(num_features)
    full_chi_sqs = np.random.chisquare(df=full_dfs, size=(B, len(full_dfs)))

    null_dfs = np.arange(1, num_features)
    null_chi_sqs = np.random.chisquare(df=null_dfs, size=(B, len(null_dfs)))
    # The first column contributes log(0 / full + 1) = 0 to the sum.
    null_chi_sqs = np.hstack([np.zeros(B).reshape((B, 1)), null_chi_sqs])
    return n*(np.sum( np.log(null_chi_sqs/full_chi_sqs + 1), axis=1))

def Q(qs, n, p, k, B=int(1e5), seed=0):
    """Return the `qs` quantile(s) of the draws produced by `sample_null_dist`.

    `n`, `p`, `k`, `B` and `seed` are forwarded unchanged to
    `sample_null_dist`; `qs` is handed to `np.quantile`.
    """
    null_samples = sample_null_dist(n, p, k, B=B, seed=seed)
    return np.quantile(null_samples, qs)

def subset_selection(X, 
                     alpha, 
                     include=np.array([]), 
                     exclude=np.array([]), 
                     quantile_dict={}, 
                     B=int(1e5),
                     max_iter=100,
                     num_inits=1,
                     tol=TOL):
    """Select a subset of the columns of X by a greedy pass followed by shrinking.

    The standardized sample covariance of X is passed to
    `greedy_subset_factor_selection` together with one cutoff per subset size.
    If the greedy result is still rejected it is returned with a warning.
    Otherwise the subset size is reduced one step at a time with
    `swapping_subset_factor_selection` until a size is rejected (or the size
    reaches ``len(include)``), and the search is rerun at the smallest
    non-rejected size with ``find_minimizer=True``.

    Parameters
    ----------
    X : array with shape (n, p)
    alpha : float
        Critical values are taken at level ``1 - alpha``.
    include, exclude : numpy.ndarray
        Forwarded to the greedy and swapping routines.
    quantile_dict : dict
        Optional cache mapping ``(1 - alpha, n, p, i)`` to a critical value;
        missing entries are simulated with `Q`. The dict is only read, never
        modified, so the shared default is harmless.
    B : int
        Number of Monte Carlo draws used by `Q` for missing critical values.
    max_iter, num_inits : int
        Forwarded to `swapping_subset_factor_selection`.
    tol : float
        Numerical tolerance forwarded to both search routines.

    Returns
    -------
    S
        The subset returned by the last search routine that was run.
    """
    n, p = X.shape
    _, Sigma_hat = get_moments(X)
    Sigma_hat = standardize_cov(Sigma_hat)

    # One critical value per candidate subset size 0..p; only simulate the
    # ones that are not already cached (Q is expensive).
    level = 1 - alpha
    crit_vals = []
    for i in range(p + 1):
        key = (level, n, p, i)
        if key in quantile_dict:
            crit_vals.append(quantile_dict[key])
        else:
            crit_vals.append(Q(level, n, p, i, B=B))
    crit_vals = np.array(crit_vals)
    cutoffs = crit_vals/n + np.linalg.slogdet(Sigma_hat)[1]

    S, reject = greedy_subset_factor_selection(Sigma_hat,
                                               cutoffs,
                                               include=include,
                                               exclude=exclude,
                                               tol=tol)

    if reject:
        warnings.warn("We can still reject the model with this S, but nothing more can be added.")
        return S
    if len(S) <= 1:
        return S

    # Shrink while the next smaller size is still not rejected. The size never
    # drops below len(include): previously the loop had no lower bound, so it
    # could request fewer variables than `include` holds or reach k = -1, where
    # cutoffs[-1] silently selects the wrong cutoff.
    k = len(S)
    min_k = len(include)
    while k > min_k:
        _, smaller_rejected = swapping_subset_factor_selection(Sigma_hat,
                                                               k - 1,
                                                               cutoffs[k - 1],
                                                               max_iter=max_iter,
                                                               num_inits=num_inits,
                                                               include=include,
                                                               exclude=exclude,
                                                               tol=tol)
        if smaller_rejected:
            break
        k -= 1

    # Rerun at the smallest non-rejected size, this time asking for the
    # minimizer. `tol` is now honoured here (it used to be hard-coded to TOL).
    S, _ = swapping_subset_factor_selection(Sigma_hat,
                                            k,
                                            cutoffs[k],
                                            max_iter=max_iter,
                                            num_inits=num_inits,
                                            find_minimizer=True,
                                            include=include,
                                            exclude=exclude,
                                            tol=tol)
    return S
            



In [461]:
# Load the BFI data; the first CSV column is dropped and the rest cast to int.
# NOTE(review): presumably the first column is a row index — confirm against the file.
import pandas as pd
X = pd.read_csv("../data/BFI228.csv").values[:, 1:].astype(int)
n, p = X.shape
_, Sigma_hat = get_moments(X)

alpha = 0.1
# Precompute the level-(1 - alpha) value of Q for every subset size 0..p so that
# subset_selection can look the critical values up instead of re-simulating them.
quantile_dict = {(1 - alpha, n, p, i,):  Q(1 -alpha, n, p, i,) for i in range(p + 1) }

In [465]:
# Time a single call to subset_selection using the precomputed critical values.
import time
start = time.time()
# NOTE(review): if p is still X.shape[1], exclude=np.arange(p) lists every
# column index of X — confirm that excluding all variables is intended here.
S = subset_selection(X, 
                 alpha=alpha,  
                 quantile_dict=quantile_dict, 
                 num_inits=1,
                 exclude = np.arange(p))
end = time.time()
print(end - start)

0.01699209213256836




In [464]:
# Display the selected indices in ascending order (S itself is left unsorted).
np.sort(S)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 25, 27, 29, 31, 34, 37, 38, 41, 43])

In [431]:
# Display S in the order it was returned.
S

array([ 1,  2, 31, 35,  3, 29, 43,  6, 19, 13, 15, 23, 20, 27, 12, 32, 22,
       38, 25,  0, 41, 24])

In [408]:
# Number of selected variables in the current S.
print(len(S))

20


In [445]:
# Run the swapping search directly at a fixed subset size with 100 random inits.
k=20
# NOTE(review): `cutoffs` is not assigned in any visible notebook cell (it is a
# local variable inside subset_selection), so this relies on leftover kernel state.
# NOTE(review): subset_selection pairs size k with cutoffs[k]; here size k is
# paired with cutoffs[k-1] — confirm which index is intended.
S, reject = swapping_subset_factor_selection(Sigma_hat,
                                 k,
                                 cutoffs[k-1],
                                 max_iter=100,
                                 num_inits=100,  
                                 tol=TOL)

print(np.sort(S))
print(reject)

[ 0  3  6  7 12 13 14 15 17 20 22 24 25 27 29 30 35 38 41 43]
False


In [140]:
# Diagonal covariance with variances 1, 1, 3, 4, 5; setting the (0, 1) covariance
# to 1 makes the first two variables perfectly correlated.
Sigma = np.diag([1, 1, 3, 4, 5])
Sigma[0, 1] = Sigma[1, 0] = 1
print(Sigma)

[[1 1 0 0 0]
 [1 1 0 0 0]
 [0 0 3 0 0]
 [0 0 0 4 0]
 [0 0 0 0 5]]


In [262]:
# Swapping CSS with k=3 and five random initializations; include=[0] presumably
# forces variable 0 into the subset (verify against swapping_css).
swapping_css(Sigma,
           k=3,
           max_iter=100,
           num_inits=5, 
           include=np.array([0]),
           exclude=np.array([]),
           tol=TOL)

(array([3, 0, 4]),
 array([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 3., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]),
 array([0, 3, 2]),
 True)

In [260]:
# Same search with a single initialization and empty include/exclude arrays.
swapping_css(Sigma,
           k=3,
           max_iter=100,
           num_inits=1, 
           include=np.array([]),
           exclude=np.array([]),
           tol=TOL)

(array([3, 4, 2]),
 array([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]),
 array([4, 0, 3]),
 True)

NameError: name 'idxs1' is not defined

In [2]:
# NOTE(review): np.diag applied to the 2-D np.eye(10) extracts its diagonal, so
# Sigma here is a length-10 vector of ones rather than a 10x10 matrix — confirm
# whether np.eye(10) was intended.
Sigma = np.diag(np.eye(10))
idx_order = np.array(np.arange(10))
# idx_order is passed by keyword and displayed afterwards; the recorded output
# shows its entries at positions 8 and 9 exchanged.
swap_in_place(Sigma, np.array([8, 9]), np.array([9, 8]), idx_order=idx_order)
idx_order

array([0, 1, 2, 3, 4, 5, 6, 7, 9, 8])

In [None]:
# Scratch cell: a bare tuple literal that is never assigned; its purpose is not
# evident from the notebook.
3410, 1034

In [None]:
# This cell looks like the body of a test pasted out of a function: the second
# half was still indented, which made the whole cell an IndentationError. All
# statements are now at top level.
# NOTE(review): `rho` is not defined in any visible cell, and `Sigma` / `p` come
# from whatever cell ran last — define them here before relying on this check.
Sigma[0, 0] = 1
Sigma[1, 1] = 1
Sigma[:2, 2:] = rho
Sigma[2:, :2] = rho
# NOTE(review): if Sigma is a 2-D ndarray, both operands of `@` are 1-D, so the
# product is a scalar added to every entry of the identity. Confirm whether a
# matrix product such as Sigma[2:, :2] @ Sigma[:2, 2:] was intended.
Sigma[2:, 2:] = np.diag(np.ones(p-2)) + Sigma[2:, 0] @ Sigma[0, 2:]

# Starting from S_init = {0, 1} with cutoff 0, the search is expected not to
# reject and to return the same two indices.
S, reject = swapping_subset_factor_selection(Sigma, k=2, S_init=np.array([0, 1]), cutoff=0, tol=TOL)
assert reject == False
assert set(S) == set(np.array([0, 1]))
    

In [22]:
# Scratch check: wrapping a set in np.array yields a 0-d object array holding the
# set, not an integer array of its elements.
np.array(set([5]))

array({5}, dtype=object)

In [34]:
# Scratch check: indexing rows with a length-1 index array keeps the result 2-D
# (one row of 7 entries).
np.zeros((7, 7))[np.array([4]),:]

array([[0., 0., 0., 0., 0., 0., 0.]])

In [41]:
idxs1 = np.array([2, 4])
idxs2 = np.array([4, 3])

# All indices that occur in either array.
union = set(idxs1) | set(idxs2)

# Each array followed by the union members it does not already contain.
orig = np.concatenate([idxs1, list(union - set(idxs1))])
perm = np.concatenate([idxs2, list(union - set(idxs2))])

In [25]:
# Diagonal covariance with distinct variances 1..5.
Sigma = np.diag(np.array([1, 2, 3, 4, 5]))
    
# Ask greedy_css for k equal to the number of variables (5).
S, Sigma_R = greedy_css(Sigma,k=5)

In [16]:
# Unit-variance covariance with two groups of three variables: variable 2 has
# covariance 1 - eps with variables 0 and 1, and variable 5 with variables 3 and 4.
eps=0.05
p=6
Sigma = np.diag(np.ones(p))
Sigma[2, 0], Sigma[0, 2], Sigma[2, 1], Sigma[1, 2] = 1-eps, 1-eps, 1-eps, 1-eps
Sigma[5, 3], Sigma[3, 5], Sigma[5, 4], Sigma[4, 5] = 1-eps, 1-eps, 1-eps, 1-eps
# Start the swapping search from two variables of the first group.
S_init = np.array([0, 1])
S, _, _, converged = swapping_css(Sigma, k=2, S_init=S_init)
print(S)
print(converged)
# Repeat starting from two variables of the second group.
S_init = np.array([4, 3])
S, _, _, converged = swapping_css(Sigma, k=2, S_init=S_init)
print(S)
print(converged)

[1 2]
True
[3 5]
True


In [15]:
# Display the current covariance matrix.
Sigma

array([[1.  , 0.  , 0.95, 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.95, 0.  , 0.  , 0.  ],
       [0.95, 0.95, 1.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 1.  , 0.  , 0.95],
       [0.  , 0.  , 0.  , 0.  , 1.  , 0.95],
       [0.  , 0.  , 0.  , 0.95, 0.95, 1.  ]])