# Ignore me - go to My_HAL_sims.ipynb

In [None]:
# This is the Python implementation of the HAL algorithm.

from collections import defaultdict
from itertools import chain
from sklearn.linear_model import LassoCV
import numpy as np
import pandas as pd


# Main HAL implementation

In [4]:

def quantize_col(x, k):
    """
    Snap every value of x down to one of (at most) k empirical quantiles of x.

    The cut points are the quantiles of x at probabilities 0, 1/k, 2/k, ...
    (all strictly below 1), so the bins hold roughly equal numbers of samples
    rather than being equally wide. Each value is replaced by the largest cut
    point that does not exceed it.

    Parameters:
    x (ndarray): Input array to be quantized.
    k (int or float): Number of bins. Integral floats such as 200.0 are accepted.
        k == 0 collapses every entry to the minimum of x.

    Returns:
    ndarray: Array shaped like x whose entries are the selected cut points.
    """
    if k == 0:
        return np.full_like(x, np.min(x))
    # Build the probabilities by integer counting rather than
    # np.arange(0, 1, 1/k): with a fractional step, rounding can make arange
    # emit one extra element just below 1 (e.g. k=49), which produced k+1
    # cut points instead of k.
    n_cuts = int(np.ceil(k))
    probs = np.arange(n_cuts) / k
    cuts = np.quantile(x, probs)
    # side='right' counts the cut points <= value; subtracting one gives the
    # index of the largest such cut point. The 0-quantile is min(x), so the
    # index is never negative.
    idx = np.searchsorted(cuts, x, side='right') - 1
    return cuts[idx]

def quantize(X, k):
    """
    Coarsen a sample matrix column by column so each feature takes at most k values.

    Parameters:
    X (numpy.ndarray): Matrix of shape (n_samples, n_features).
    k (int or float): Target number of distinct values per column. When k is
        at least the number of rows, X is handed back untouched (the very
        same object, not a copy).

    Returns:
    numpy.ndarray: Matrix of shape (n_samples, n_features) with each column
        passed through quantize_col.
    """
    n_samples = X.shape[0]
    if n_samples <= k:
        # Nothing to coarsen: there are no more rows than requested levels.
        return X

    quantized_columns = []
    for column in X.T:
        quantized_columns.append(quantize_col(column, k))
    # Columns were stacked as rows; transpose back to (n_samples, n_features).
    return np.stack(quantized_columns).T

class HAL:
    """
    HAL (Highly Adaptive Lasso) regressor.

    The inputs are expanded into 0/1 indicator basis functions of the form
    "x_j >= knot_j" (and elementwise products of those across features), and a
    cross-validated lasso (LassoCV) is fit on that expanded design matrix.

    Parameters:
    - bin_depths (dict): Maps a number of bins to the interaction depths that
                         use knots quantized to that many bins, i.e.
                         {n_bin: depths}. An empty depths list means "all
                         depths from 1 to n_features". If None, the default is
                         {np.inf: []}: unquantized knots (every training row)
                         at all depths.
    - sparse_cutoff (float): Bases whose fraction of ones is not strictly
                         between sparse_cutoff and 1 - sparse_cutoff are
                         dropped when filtering. If None, 1/sqrt(n_samples)
                         of the matrix being filtered is used.
    - filter (bool): Whether to drop duplicate and too-sparse/too-dense bases
                         before fitting. Default is False.
    - **kwargs: Additional keyword arguments passed to the LassoCV class.

    Attributes set while fitting:
    - knots: dict mapping tuple(depths) to the knot matrix for those depths.
    - filter_idx: column selection applied to the basis matrix in predict.
    - sparse_cutoff_: the cutoff actually used by the last filter_bases call.

    Methods:
    - fit_val(X, Y): Fit the HAL model on the training data X and target variable Y.
    - predict(X): Make predictions using the HAL model on the input data X.
    """
    def __init__(self, bin_depths=None, sparse_cutoff=None, filter=False, **kwargs):
        # NOTE: the parameter name `filter` shadows the builtin; it is kept
        # because it is part of the constructor's keyword interface.
        self.lasso = LassoCV(**kwargs)
        self.sparse_cutoff = sparse_cutoff
        self.filter = filter
        # slice(None) selects every column, i.e. "no filtering".
        self.filter_idx = slice(None)
        if bin_depths is None: # {n_bin: (depths)}
            self.bin_depths = {np.inf: []}
        else:
            self.bin_depths = bin_depths

    @classmethod
    def _basis_products(cls, one_way, max_depth=None, index=0, basis=None, bases=None):
        """
        Recursively enumerate elementwise AND-products of the one-way bases.

        Each feature's indicator array in `one_way` is either included in or
        left out of the running product, with at most `max_depth` inclusions.

        Parameters:
        - one_way: sequence of boolean arrays, one per feature.
        - max_depth: remaining number of factors that may still be included
                     (defaults to len(one_way) on the outermost call).
        - index, basis, bases: recursion state; callers leave these unset.

        Returns:
        - defaultdict(list) keyed by the *remaining* depth budget: a product
          built from m factors is stored under (initial max_depth - m). The
          key equal to the initial max_depth holds the all-True empty product.
        """
        if max_depth is None:
            max_depth = len(one_way)
        if bases is None:
            bases = defaultdict(list)
        if basis is None:
            basis = np.ones_like(one_way[0], dtype=bool)

        if index == len(one_way) or max_depth == 0:
            # Either every feature has been decided or the budget is spent.
            bases[max_depth].append(basis)
        else:
            # Branch 1: include feature `index` in the product.
            cls._basis_products(one_way, max_depth-1, index+1, basis & one_way[index], bases)
            # Branch 2: skip feature `index`.
            cls._basis_products(one_way, max_depth,   index+1, basis,                  bases)
        return bases

    @classmethod
    def _one_way(cls, X, knots):
        """
        Build the single-feature indicators knots[i, j] <= X[s, j].

        Returns a boolean array of shape (n_features, n_knots, n_samples),
        where n_features is taken from knots.shape[1].
        """
        return np.stack([
            np.less_equal.outer(knots[:,j], X[:,j])
            for j in range(knots.shape[1])
        ])

    @classmethod
    def bases(cls, X, knots, depths):
        """
        Compute the basis rows for one knot set at the requested depths.

        Parameters:
        - X: sample matrix, indexed as X[:, j].
        - knots: knot matrix, indexed as knots[:, j].
        - depths: interaction depths to keep; an empty sequence means all
                  depths from 1 to X.shape[1].

        Returns:
        - Boolean array with one row per (product, knot) pair and one column
          per sample.
        """
        if len(depths)==0: # [] represents all depths
            depths = range(1, X.shape[1]+1)
        deepest = max(depths)
        products = cls._basis_products(cls._one_way(X, knots), deepest)
        # Products with exactly d factors live under key (deepest - d).
        return np.concatenate(list(chain.from_iterable(
            [products[deepest-d] for d in depths]
        )))

    def multibases(self, X):
        """
        Evaluate every fitted knot set on X.

        Returns the design matrix with one row per sample and one column per
        basis function. Requires self.knots, which fit_val sets.
        """
        return np.concatenate([
            self.bases(X, knots, depths)
            for (depths, knots) in self.knots.items()
        ]).T

    def filter_bases(self, H):
        """
        Drop duplicate columns of H and columns that are almost all 0 or all 1.

        Records the surviving original column indices in self.filter_idx (so
        predict can select the same columns) and the cutoff that was applied
        in self.sparse_cutoff_.

        Returns:
        - The filtered matrix. Columns come out in np.unique's sorted order.
        """
        # Resolve the default locally instead of overwriting the constructor
        # argument: otherwise a later fit on a different sample size would
        # silently reuse the cutoff derived from the first fit.
        cutoff = self.sparse_cutoff
        if cutoff is None:
            cutoff = 1/np.sqrt(H.shape[0])
        self.sparse_cutoff_ = cutoff
        H, filter_idx = np.unique(H, return_index=True, axis=1)
        pct1 = np.mean(H, axis=0)  # fraction of ones in each basis column
        keep = (cutoff < pct1) & (pct1 < 1-cutoff)
        self.filter_idx = filter_idx[keep]
        return H[:,keep]

    def fit_val(self, X, Y):
        """
        Fit the model: choose knots from X, build the bases, run LassoCV.

        Parameters:
        - X: training sample matrix of shape (n_samples, n_features).
        - Y: training targets, passed straight to LassoCV.fit.
        """
        # NOTE(review): knots are keyed by tuple(depths), so two bin_depths
        # entries that list the same depths collapse into one (the later
        # entry wins).
        self.knots = {
            tuple(depths): quantize(X, n_bin)
            for (n_bin, depths) in self.bin_depths.items()
        }
        H = self.multibases(X)
        if self.filter:
            H = self.filter_bases(H)
        else:
            # Clear any column selection left behind by an earlier filtered
            # fit so predict uses exactly the columns the lasso sees here.
            self.filter_idx = slice(None)

        self.lasso.fit(H, Y)

    def predict(self, X):
        """
        Predict targets for X using the fitted knots, column filter and lasso.
        """
        H = self.multibases(X)[:,self.filter_idx]
        return self.lasso.predict(H)

def HAL9001(**kwargs):
    """
    Build a HAL model with a fixed bins-per-depth schedule.

    The schedule halves the number of bins each time the interaction depth
    grows by one: 200 bins for depth 1, 100 for depth 2 and 50 for depth 3.

    Parameters:
    - kwargs: Extra keyword arguments forwarded unchanged to the HAL constructor.

    Returns:
    - An instance of the HAL class.
    """
    schedule = {}
    for depth in (1, 2, 3):
        n_bins = 200 / 2 ** (depth - 1)
        schedule[n_bins] = [depth]
    return HAL(bin_depths=schedule, **kwargs)


# Data generator functions

In [1]:
## Generate simple d-dimensional data (2D by default)

def generate_multivariate_data(n=500, d=2, seed=0):
    """
    Generates data with multivariate inputs and a univariate output.

    Inputs are drawn uniformly from [-4, 4) in each coordinate and the output
    is Y = 2 * sin(pi/2 * r) + noise, where r is the Euclidean norm of the
    input row (the absolute value when d == 1) and the noise is standard
    normal.

    Parameters:
    - n: Number of samples.
    - d: Dimensionality of the input data (X); must be at least 1.
    - seed: Seed passed to np.random.seed before sampling. The default 0
            reproduces the same data on every call. Pass None to leave the
            global NumPy random state untouched.

    Returns:
    A pandas DataFrame with columns ['X1', 'X2', ..., 'Xd', 'Y'] for d-dimensional X and univariate Y.

    Raises:
    ValueError: If d is smaller than 1.

    Note:
    Seeding resets NumPy's *global* random state, which also affects any
    other code drawing from np.random afterwards.
    """
    if d < 1:
        raise ValueError(f"d must be at least 1, got {d}")
    if seed is not None:
        np.random.seed(seed)  # For reproducibility

    # Generate independent variables with d dimensions
    X = np.random.uniform(-4, 4, size=(n, d))

    # Generate noise
    epsilon = np.random.normal(0, 1, size=(n, 1))

    # Distance of each row from the origin, kept as an (n, 1) column so it
    # lines up with epsilon.
    if d > 1:
        radius = np.linalg.norm(X, axis=1).reshape(-1, 1)
    else:
        radius = np.abs(X)
    Y = 2 * np.sin(np.pi / 2 * radius) + epsilon

    # Creating DataFrame
    column_names = ['X' + str(i+1) for i in range(d)] + ['Y']
    data = pd.DataFrame(np.hstack((X, Y)), columns=column_names)

    return data


In [None]:
# Generate the example dataset with two input features (d=2); the sample
# count is left at the generator's default.
data_2d = generate_multivariate_data(d=2)

# Display the first few rows of the dataframe (a bare expression, so it only
# shows output when it is the last statement of a notebook cell).
data_2d.head()


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
0,-1.188975,-1.107711,0.13111,0.479788,2.158754,0.673468,-0.023772,-0.522301,-0.376051,2.385984,65.052001
1,0.304678,0.873407,-0.22404,1.137279,-0.158486,-0.167797,0.329299,0.095985,0.882807,1.48014,227.463721
2,0.606542,-0.735028,-0.528557,-0.071976,0.379535,-0.038572,-1.376957,1.863931,-1.431093,1.100499,-207.454736
3,-0.799108,-0.557484,-0.790976,-0.832768,0.909953,0.982595,0.172157,-0.927648,0.124563,-0.392564,-63.067181
4,0.588897,-0.32864,1.733899,-0.891965,0.699899,0.481374,3.033027,0.282997,0.449477,-0.306869,227.288541


In [23]:
from sklearn.model_selection import train_test_split

## Create feature and target arrays:
# NOTE(review): `regression_data` is not defined anywhere in the visible
# cells - presumably it is loaded in a cell that is not shown; confirm before
# running this file top to bottom.
# X = every column EXCEPT "Y" (the features), as a NumPy array
X = regression_data.drop("Y", axis=1).values
# y = the "Y" column (the target), as a NumPy array
y = regression_data["Y"].values

## Split the data into training and test sets:
# 30% of the rows are held out for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [24]:
# Default HAL: unquantized knots (every training row), all interaction
# depths, and no basis filtering.
hal = HAL()

# Fit the model to the training data
hal.fit_val(X_train, y_train)

# Make predictions on the test data
y_pred = hal.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import mean_squared_error

# Calculate the mean squared error on the held-out test set
mse = mean_squared_error(y_test, y_pred)

# Print the mean squared error, rounded to two decimals
print(f'Mean Squared Error: {mse:.2f}')

Mean Squared Error: 3196.66
