In [1]:
# Run sslasso from Python notebook
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path
import subprocess
from typing import Union, Any

In [2]:
# Scratch check: list the contents of the temporary directory.
# NOTE(review): hard-coded, machine-specific absolute path -- this cell will fail on any other machine.
os.listdir('/home/siebenschuh/Projects/Optimization_under_uncertainty/Experiments/Tmp')

[]

## Use code below for `SSLASSO` regression model

In [38]:
def parse_R_alpha(output:str):
    """
    Parses the str-output of the SSLASSO method into the coefficient vector estimate `alpha` of length 1+p (incl. intercept)

    The text is expected to look like a printed R numeric vector, i.e. whitespace-separated numbers
    prefixed by index markers such as `[1]`. Both fixed notation (`-3.697825`) and scientific
    notation (`1.5e-05`, `3e-04`) are recognised. Bare integers without exponent are deliberately
    ignored so that stray index-like tokens are never mistaken for coefficients.

    Args:
        output (str)    :     String output from the R script running the SSLASSO routine representing the numeric vector.
    Returns:
        A 1-d NumPy float array corresponding to the parameter estimate (empty if `output` contains no numbers).
    Raises:
        AssertionError  :     If the parsed array is not a float vector.
    """

    # replace square bracket integers and newlines by a blank; substituting '' instead
    # could fuse the last number of one line with the first number of the next line
    cleaned_str = re.sub(r'\[\d+\]|\n', ' ', output)

    # extract numerical values; the scientific-notation alternative comes first so that
    # `1.5e-05` is read as one number instead of being truncated to `1.5`
    number_pattern = r'-?\d+(?:\.\d+)?[eE][-+]?\d+|-?\d+\.\d+'
    alpha_hat = np.array([float(val) for val in re.findall(number_pattern, cleaned_str)], dtype=float)

    assert alpha_hat.dtype==float, "Parsed numpy array should be float vector."

    return alpha_hat



def get_alpha_MAP(X:np.ndarray,
                  y:np.ndarray,
                  verbose:bool              = False, 
                  tmp_dir:Union[Path, str]  = './tmp',
                  script_name:str           = 'sslasso_map.r',
                  conda_env:str             = 'R',
                  conda_path:str            = '/soft/datascience/conda/2022-09-08/mconda3/condabin/conda'):
    """
    Wraps the MAP SSLASSO routine in R. Runs maximum a-posteriori estimation for Spike & Slab LASSO returning an estimate for the coefficient vector alpha.
    (Source: https://cran.r-project.org/web/packages/SSLASSO/SSLASSO.pdf)

    `X` and `y` are written to text files inside `tmp_dir`, the R script is invoked through
    `conda run -n <conda_env> Rscript <script_name> <X_path> <y_path>`, its stdout is parsed with
    `parse_R_alpha`, and the temporary files are removed again (also when the call fails).

    Args:
        - X (np.ndarray)     :    Design matrix (exogenous), 2d
        - y (np.ndarray)     :    Vector of observations corresponding to X, 1d
        - verbose (bool)     :    Indicates if more elaborate output is desired.
        - tmp_dir (Path)     :    Existing directory in which `X` and `y` are stored so the R script can pick them up.
        - script_name (str)  :    Path of the R script that runs SSLASSO.
        - conda_env (str)    :    Name of the conda environment providing `Rscript` (only `R` is accepted).
        - conda_path (str)   :    Path of the conda executable used to run the script inside `conda_env`.

    Returns:
        - 1d NumPy array representing the MAP of the coefficient vector alpha, or
          `None` if the R script failed or printed nothing to stdout.

    Raises:
        - AssertionError     :    If any of the input checks below fails.
        - OSError            :    If the conda executable cannot be launched (propagated from `subprocess.run`).
    """

    # check input format
    assert X.ndim==2, "Design matrix `X` should be a 2d numpy array."
    assert y.ndim==1, "Vector of responses `y` should be a 1d numpy array."
    assert len(X)==len(y), "Lengths of design matrix `X` and observation vector `y` must coincide."
    assert os.path.isdir(tmp_dir), "Directory to store `X`, `y` temporarily does not exist."
    assert os.path.isfile(script_name), f"The R script `{script_name}` does not exist."
    assert conda_env in ['R'], "Only valid conda env containing R utilities is `R`."
    assert os.path.isfile(conda_path), f"`conda_path` is invalid. {conda_path} does not exist"

    # paths of the temporary data files
    X_path = Path(tmp_dir) / 'X_tmp.txt'
    y_path = Path(tmp_dir) / 'y_tmp.txt'

    # R script invocation (incl. conda environment activation); every argument is a plain string
    command = [
        str(conda_path), 'run', '-n', conda_env, 'Rscript',
        str(script_name), str(X_path), str(y_path)
    ]

    output = None
    error_output = None
    try:
        # store data temporarily
        np.savetxt(X_path, X)
        np.savetxt(y_path, y)

        # run the command and capture the output
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
        output = result.stdout
        error_output = result.stderr
    except subprocess.CalledProcessError as e:
        # non-zero exit status of the R run: keep stderr for the diagnostics below
        error_output = e.stderr
    finally:
        # delete tmp files no matter how the block above ended (success, R failure, launch error)
        for file_path in (X_path, y_path):
            if os.path.isfile(file_path):
                os.remove(file_path)

    # parse the captured output
    if output:
        alpha = parse_R_alpha(output)
        if verbose:
            print("Output ok.")
        return alpha

    if verbose:
        print("Error Output:")
        print(error_output)
    else:
        # do not fail silently: the caller receives None and should learn why
        import warnings
        warnings.warn(
            "SSLASSO R script produced no output; returning None. "
            "Re-run with `verbose=True` to see the captured stderr.",
            RuntimeWarning,
            stacklevel=2,
        )

    return None
    


In [39]:
## Simulated sparse linear regression with more features than observations (p > n)
n = 150
p = 1000

# design matrix: Gaussian entries (sd 3); first column is overwritten with ones for the intercept
np.random.seed(5858)
X = np.random.normal(loc=0, scale=3, size=(n, p + 1))
X[:, 0] = 1

# true coefficients: tiny Gaussian noise rounded to 4 decimals, plus three large signal entries
alpha = np.round(np.random.normal(loc=0, scale=0.001, size=p + 1), 4)
alpha[[1, 17, 36]] = [4.0, -3.75, 2.5]

# responses: linear predictor plus additive standard Gaussian noise
y = np.random.normal(loc=X @ alpha, scale=1.0)


In [50]:
# Scratch check: show the absolute form of the relative directory `../Tmp`
# (`.absolute()` prefixes the current working directory and keeps the `..` component).
Path('../Tmp').absolute()

PosixPath('/home/siebenschuh/Projects/Optimization_under_uncertainty/Experiments/Sparse_Bayes_Reg/../Tmp')

In [55]:
# Scratch check: test whether the `Experiment` directory exists.
# NOTE(review): hard-coded, machine-specific absolute path.
os.path.isdir('/home/siebenschuh/Projects/Optimization_under_uncertainty/Experiments/Sparse_Bayes_Reg/Experiment')

False

In [40]:
# Estimate the coefficient vector on the simulated data using the function's defaults
# (temp directory `./tmp`, script `sslasso_map.r`, both relative to the working directory).
# `get_alpha_MAP` returns None when the R run yields no output, so `alpha_hat` may be None.
alpha_hat = get_alpha_MAP(X=X, y=y)

In [41]:
# Show the estimated coefficients whose absolute value exceeds 1
# (the simulated `alpha` has exactly three such entries: 4.0, -3.75 and 2.5).
alpha_hat[np.abs(alpha_hat)>1]

array([ 3.91651 , -3.697825,  2.414793])