From d52d8b272fbc35f387e2365e5bdcbd1efe38335b Mon Sep 17 00:00:00 2001
From: Greenewald
Date: Wed, 31 May 2023 09:54:53 -0400
Subject: [PATCH 1/6] Update shared_sparsity_selection.py

Updates to improve robustness and quality of life:

- Addition of a max_norm constraint to avoid overflow or non-sparse outcomes
  under the nonconvex regularizer (this involves a minor change to the
  optimization algorithm). The theory includes this parameter, but the original
  version omitted it since it only seems to matter in extreme circumstances
  where the hyperparameters are poorly chosen. An "auto" option is provided for
  easy use, and "none" disables the constraint.
- Addition of an "auto" option for the mcp_alpha parameter, since choosing it
  manually may be unintuitive.
- Addition of an include_bias option, defaulting to True. It subtracts the mean
  of each group of covariates and outcomes (something we typically did outside
  the module), which is equivalent to adding an unregularized bias term to the
  regressions.
- Replacement of the "auto" option for mcp_lambda with a proportional_lambda
  option which, when True, interprets mcp_lambda as a constant of
  proportionality applied to the previous "auto" rate. In practice the
  coefficient of the "auto" option almost always needs tuning, but the analytic
  expression it uses is extremely helpful otherwise.
---
 .../shared_sparsity_selection.py | 75 +++++++++++++++----
 1 file changed, 62 insertions(+), 13 deletions(-)

diff --git a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
index 4ecd7934..9536bbb0 100644
--- a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
+++ b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
@@ -11,11 +11,10 @@

 class MCPSelector:
     # TODO: transpose `theta` and rename it `coef_` to align with sklearn models
-    def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
+    def __init__(self, lmda=0.01, alpha="auto", max_norm="auto", step=0.1, max_iter=1000, tol=1e-3, proportional_lambda=True, include_bias=True):
         """Constructor for MCPSelector. This class computes shared
         sparsity matrix using proximal gradient descent applied with
         MCP regularizer.
-
         Args:
             lmda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
                 The bigger the value the stronger the regularization.
@@ -27,32 +26,62 @@ def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
             tol (float): Stopping criterion for MCP. If the normalized value of
                 proximal gradient is less than tol then the algorithm is assumed
                 to have converged.
+            proportional_lambda (bool): Interpret the lmda parameter as a constant of
+                proportionality for the theory-driven optimal lambda rate. Makes tuning
+                simpler across changing numbers of samples, covariates, actions, etc.
+            include_bias (bool): If True, include bias terms in the regressions that are not regularized.
""" super().__init__() self.alpha = alpha + self.R = max_norm self.lmda = lmda + self.lmda_proportional = proportional_lambda self.step = step self.max_iter = max_iter self.tol = tol self.epsilon_safe_division = 1e-6 + self.include_bias = True def _initialize_internal_params(self, X, a, y): treatments = list(a.unique()) n_treatments = len(treatments) n_confounders = X.shape[1] - if self.lmda == "auto": - avg_num_samples = len(y) / n_treatments - lmda = 0.2 * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples) + avg_num_samples = len(y) / n_treatments + if self.lmda_proportional: + lmda = self.lmda * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples) else: lmda = self.lmda - assert self.alpha >= 0 + if self.alpha == "auto": + min_var = np.Inf + for i,t in enumerate(treatments): + u = X.loc[a == t].values.T + + tmp = np.min(np.std(u,axis=1)) + if tmp > 0: + min_var = min(min_var, tmp) + alpha = 1 / (.25 * min_var) + else: + alpha = self.alpha + + if self.alpha == 0 or self.R == "none": + R = np.inf + else: + if self.R == "auto": + R = 1 * np.sqrt(n_treatments) / (8 * lmda) + else: + R = self.R + + assert alpha >= 0 + assert R > 0 assert lmda >= 0 corr_xx = np.zeros((n_confounders, n_confounders, n_treatments)) corr_xa = np.zeros((n_confounders, n_treatments)) for i, t in enumerate(treatments): u, v = X.loc[a == t].values.T, y.loc[a == t].values + if self.include_bias: + u = u - np.mean(u,axis=1,keepdims=True) @ np.ones((1,u.shape[1])) + v = v - np.mean(v) corr_xx[:, :, i] = (u @ u.T) / len(v) corr_xa[:, i] = (u @ v) / len(v) self.theta_ = np.zeros((n_confounders, n_treatments)) @@ -60,13 +89,17 @@ def _initialize_internal_params(self, X, a, y): self.n_confounders_ = n_confounders self.n_treatments_ = n_treatments self.lmda_ = lmda + self.R_ = R + self.alpha_ = alpha - def _mcp_q_grad(self, t): + + #Minimax Convex Penalty (MCP) regularizer + def _mcp_q_grad(self, t): if self.alpha == 0: return np.sign(t) * self.lmda_ * (-1) else: return (np.sign(t) * self.lmda_ - * np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha))) + * np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha_))) def _update_grad_op(self, theta): theta_new = np.zeros_like(theta) @@ -83,9 +116,19 @@ def _update_grad_op(self, theta): def _update_proximal_op(self, theta): norm_theta = np.linalg.norm(theta, axis=1, keepdims=True) + # Do Proximal operator, bearing in mind max_norm constraint shrink = np.maximum(0, norm_theta - self.lmda_) + if np.sum(shrink) > self.R_: + #first take a large safe step + shrink = np.maximum(0, shrink - (np.sum(shrink) - self.R_)/len(shrink) ) + #Then iteratively step until constraint is satisfied + while np.sum(shrink) > self.R_: + shrink = np.maximum(0, shrink - .05 * self.lmda_) + theta_proximal_op = theta * ((shrink / norm_theta) @ np.ones((1, self.n_treatments_))) + + return theta_proximal_op def compute_shared_sparsity_matrix(self, X, a, y): @@ -114,16 +157,16 @@ class SharedSparsityConfounderSelection(_BaseConfounderSelection): Method by Greenewald, Katz-Rogozhnikov, and Shanmugam: https://arxiv.org/abs/2011.01979 """ - def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6, + def __init__(self, mcp_lambda=0.01, mcp_alpha="auto", max_norm = "auto", step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6,proportional_lambda=True,include_bias = True, importance_getter=None, covariates=None): """Constructor for SharedSparsityConfounderSelection - Args: - mcp_lambda (str|float): Parameter (>= 0) to control shape of MCP regularizer. 
+            mcp_lambda (float): Parameter (>= 0) to control shape of the Minimax Concave Penalty (MCP) regularizer.
                 The bigger the value the stronger the regularization.
-                "auto" will auto-select good regularization value.
             mcp_alpha (float): Associated lambda parameter (>= 0) to control shape of MCP
                 regularizer. The smaller the value the stronger the regularization.
+            max_norm (float): Maximum (1,2)-norm allowed for the coefficient vector, used if alpha != 0.
+                Decrease if the optimization does not converge.
+                "auto" will auto-select a value. "none" means do not use.
             step (float): Step size for proximal gradient, equivalent of learning rate.
             max_iter (int): Maximum number of iterations of MCP proximal gradient.
             tol (float): Stopping criterion for MCP. If the normalized value of
@@ -132,6 +175,9 @@ def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=
             threshold (float): Only if the importance of a confounder exceeds
                 threshold for all values of treatments, then the confounder is
                 retained by transform() call.
+            proportional_lambda (bool): Interpret the mcp_lambda parameter as a constant of
+                proportionality for the theory-driven optimal lambda rate. Makes tuning simpler
+                across changing numbers of samples, covariates, actions, etc.
+            include_bias (bool): If True, include bias terms in the regressions that are not regularized.
             importance_getter: IGNORED.
             covariates (list | np.ndarray): Specifying a subset of columns to perform selection on.
                 Columns in `X` but not in `covariates` will be included after `transform`
@@ -140,13 +186,14 @@ def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=
                 or anything compatible with pandas `loc` function for columns.
                 if `None` then all columns are participating in the selection process.
                 This is similar to using sklearn's `ColumnTransformer` or `make_column_selector`.
+
         """
         super().__init__(importance_getter=importance_getter, covariates=covariates)
         self.step = step
         self.max_iter = max_iter
         self.tol = tol
         self.threshold = threshold
-        self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, step=step, max_iter=max_iter, tol=tol)
+        self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, max_norm=max_norm, step=step, max_iter=max_iter, tol=tol, proportional_lambda=True, include_bias=include_bias)
         self.importance_getter = lambda e: e.selector_.theta_.transpose()  # Shape to behave like sklearn linear_model
         # self.importance_getter = "selector_.theta_"
@@ -174,3 +221,5 @@ def fit(self, X, a, y):

     def _get_support_mask(self):
         return self.support_
+
+

From 855c11f85614d3a0537576c13ee4bf98e13d5a99 Mon Sep 17 00:00:00 2001
From: Greenewald
Date: Wed, 31 May 2023 09:59:23 -0400
Subject: [PATCH 2/6] Update shared_sparsity_selection.py

Edit the parameter description.
---
 .../shared_sparsity_selection/shared_sparsity_selection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
index 9536bbb0..3192dda4 100644
--- a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
+++ b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
@@ -18,7 +18,6 @@ def __init__(self, lmda=0.01, alpha="auto", max_norm="auto", step=0.1, max_ite
         Args:
             lmda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
                 The bigger the value the stronger the regularization.
-                "auto" will auto-select good regularization value.
             alpha (float): Associated lambda parameter (>= 0) to control shape of MCP
                 regularizer. The smaller the value the stronger the regularization.
             step (float): Step size for proximal gradient, equivalent of learning rate.

From 1f95ebe14f06604f3df51606a90ecbcb293bcca6 Mon Sep 17 00:00:00 2001
From: Ehud-Karavani
Date: Mon, 29 Jul 2024 11:37:27 +0300
Subject: [PATCH 3/6] Replace `np.Inf` with `np.inf` to satisfy numpy 2
---
 .../shared_sparsity_selection/shared_sparsity_selection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
index 3192dda4..f0e6d80a 100644
--- a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
+++ b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
@@ -51,7 +51,7 @@ def _initialize_internal_params(self, X, a, y):

         if self.alpha == "auto":
-            min_var = np.Inf
+            min_var = np.inf
             for i, t in enumerate(treatments):
                 u = X.loc[a == t].values.T

From 484b5b667160a064ec161554e18380a184c7a48b Mon Sep 17 00:00:00 2001
From: Ehud-Karavani
Date: Mon, 29 Jul 2024 11:55:42 +0300
Subject: [PATCH 4/6] Remove "auto" lambda test as it is no longer supported

Commit d52d8b2 (and 855c11f) removed support for `lmda="auto"` in favor of
the `proportional_lambda` and `alpha` auto settings.
---
 .../shared_sparsity_selection.py                    |  2 +-
 .../contrib/tests/test_shared_sparsity_selection.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
index f0e6d80a..05b1c78a 100644
--- a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
+++ b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
@@ -16,7 +16,7 @@ def __init__(self, lmda=0.01, alpha="auto", max_norm="auto", step=0.1, max_ite
         sparsity matrix using proximal gradient descent applied with
         MCP regularizer.
         Args:
-            lmda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
+            lmda (float): Parameter (>= 0) to control shape of MCP regularizer.
                 The bigger the value the stronger the regularization.
             alpha (float): Associated lambda parameter (>= 0) to control shape of MCP
                 regularizer. The smaller the value the stronger the regularization.
diff --git a/causallib/contrib/tests/test_shared_sparsity_selection.py b/causallib/contrib/tests/test_shared_sparsity_selection.py
index 1c486310..29a17a6d 100644
--- a/causallib/contrib/tests/test_shared_sparsity_selection.py
+++ b/causallib/contrib/tests/test_shared_sparsity_selection.py
@@ -85,11 +85,11 @@ def test_alphas(self):

     def test_lambdas(self):
         X, a, y = self.make_xay(6, 4, n_samples=100, seed=1)
-        with self.subTest("Automatic (default) lambda"):
-            sss = SharedSparsityConfounderSelection(mcp_lambda="auto")
-            sss.fit(X, a, y)
-            expected = 0.2 * np.sqrt(2 * np.log(X.shape[1]) / (X.shape[0] / 2))
-            self.assertAlmostEqual(sss.selector_.lmda_, expected)
+        # with self.subTest("Automatic (default) lambda"):
+        #     sss = SharedSparsityConfounderSelection(mcp_lambda="auto")
+        #     sss.fit(X, a, y)
+        #     expected = 0.2 * np.sqrt(2 * np.log(X.shape[1]) / (X.shape[0] / 2))
+        #     self.assertAlmostEqual(sss.selector_.lmda_, expected)

         with self.subTest("Pre-specified lambda"):
             lmda = 2.1

From 9a41473d75387cb41079f9a0d88c8bc3caee9084 Mon Sep 17 00:00:00 2001
From: Ehud-Karavani
Date: Mon, 29 Jul 2024 12:32:14 +0300
Subject: [PATCH 5/6] Pass `proportional_lambda` from `SharedSparsity` to
 `MCPSelector`

Fixes a bug where `proportional_lambda` was always set to `True` in
`MCPSelector`, regardless of the value passed to the actual user-instantiated
`SharedSparsityConfounderSelection` model.
---
 .../shared_sparsity_selection/shared_sparsity_selection.py | 2 +-
 causallib/contrib/tests/test_shared_sparsity_selection.py  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
index 05b1c78a..9ce6048e 100644
--- a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
+++ b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
@@ -192,7 +192,7 @@ def __init__(self, mcp_lambda=0.01, mcp_alpha="auto", max_norm="auto", step=0.
         self.max_iter = max_iter
         self.tol = tol
         self.threshold = threshold
-        self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, max_norm=max_norm, step=step, max_iter=max_iter, tol=tol, proportional_lambda=True, include_bias=include_bias)
+        self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, max_norm=max_norm, step=step, max_iter=max_iter, tol=tol, proportional_lambda=proportional_lambda, include_bias=include_bias)
         self.importance_getter = lambda e: e.selector_.theta_.transpose()  # Shape to behave like sklearn linear_model
         # self.importance_getter = "selector_.theta_"

diff --git a/causallib/contrib/tests/test_shared_sparsity_selection.py b/causallib/contrib/tests/test_shared_sparsity_selection.py
index 29a17a6d..440e46ae 100644
--- a/causallib/contrib/tests/test_shared_sparsity_selection.py
+++ b/causallib/contrib/tests/test_shared_sparsity_selection.py
@@ -93,7 +93,8 @@ def test_lambdas(self):

         with self.subTest("Pre-specified lambda"):
             lmda = 2.1
-            sss = SharedSparsityConfounderSelection(mcp_lambda=lmda)
+            # `proportional_lambda=False` should take lambda as is, without changing it
+            sss = SharedSparsityConfounderSelection(mcp_lambda=lmda, proportional_lambda=False)
             sss.fit(X, a, y)
             self.assertEqual(sss.selector_.lmda_, lmda)

From b76303ec9915694c46a20432f100f707a684aae1 Mon Sep 17 00:00:00 2001
From: Ehud-Karavani
Date: Mon, 29 Jul 2024 12:39:27 +0300
Subject: [PATCH 6/6] Test lambda reduction under reasonable settings and
 `proportional_lambda`
---
 causallib/contrib/tests/test_shared_sparsity_selection.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/causallib/contrib/tests/test_shared_sparsity_selection.py b/causallib/contrib/tests/test_shared_sparsity_selection.py
index 440e46ae..1378fbf5 100644
--- a/causallib/contrib/tests/test_shared_sparsity_selection.py
+++ b/causallib/contrib/tests/test_shared_sparsity_selection.py
@@ -108,6 +108,13 @@ def test_lambdas(self):
             strong = SharedSparsityConfounderSelection(mcp_lambda=1).fit(X, a, y).selector_.theta_
             self.assertLess(np.linalg.norm(strong), np.linalg.norm(weak))

+        with self.subTest("Shrinkage under n>p binary treatment setting with proportional lambda"):
+            lmda = 1
+            # `proportional_lambda=True` should reduce lambda here, since sqrt(K * log(p) / (n / K)) < 1
+            sss = SharedSparsityConfounderSelection(mcp_lambda=lmda, proportional_lambda=True)
+            sss.fit(X, a, y)
+            self.assertLess(sss.selector_.lmda_, lmda)
+
     def test_max_iter(self):
         X, a, y = self.make_xay(6, 4, n_samples=100, seed=1)
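For anyone trying the series end to end, here is a minimal usage sketch of the updated API. The toy data and column names are hypothetical, and the `fit(X, a, y)` / `transform(X)` flow is assumed from the method signatures and docstrings in the diffs above; this is a sketch, not part of the patches.

```python
import numpy as np
import pandas as pd
from causallib.contrib.shared_sparsity_selection.shared_sparsity_selection import (
    SharedSparsityConfounderSelection,
)

# Hypothetical toy data: 200 samples, 10 covariates, binary treatment.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f"x{i}" for i in range(10)])
a = pd.Series(rng.integers(0, 2, size=200), name="a")
y = pd.Series(X["x0"] - 2 * X["x1"] + rng.normal(size=200), name="y")

# Defaults after this series: mcp_lambda scales the theory-driven rate
# (proportional_lambda=True), mcp_alpha and max_norm are auto-selected,
# and include_bias=True centers each treatment group.
selector = SharedSparsityConfounderSelection(
    mcp_lambda=0.01,
    mcp_alpha="auto",
    max_norm="auto",   # "none" disables the (1,2)-norm constraint
    include_bias=True,
)
selector.fit(X, a, y)
X_selected = selector.transform(X)  # keeps confounders selected for all treatment values
print(list(X_selected.columns))
```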
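To make `proportional_lambda` concrete: with K treatment values, p covariates, and n samples, `_initialize_internal_params` in patch 1 sets `lmda_ = mcp_lambda * sqrt(K * log(p) / (n / K))`. A quick check of the rate with illustrative sizes (not taken from the test fixture):

```python
import numpy as np

# The theory-driven rate that mcp_lambda multiplies when proportional_lambda=True.
n, p, K = 100, 10, 2
rate = np.sqrt(K * np.log(p) / (n / K))
print(rate)  # ~0.30, so mcp_lambda=1 yields lmda_ ~0.30 < 1, consistent with patch 6's assertion
```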
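The `include_bias` claim in patch 1 — that subtracting group means is equivalent to adding an unregularized bias term — can be sanity-checked outside causallib with plain least squares, using the standard centering identity for OLS:

```python
import numpy as np

rng = np.random.default_rng(1)
n, p = 500, 3
X = rng.normal(loc=5.0, size=(n, p))
y = X @ np.array([1.0, -2.0, 0.5]) + 3.0 + rng.normal(size=n)

# OLS with an explicit, unpenalized intercept column.
X1 = np.column_stack([np.ones(n), X])
coef_with_bias = np.linalg.lstsq(X1, y, rcond=None)[0][1:]

# OLS on mean-centered data with no intercept -- what include_bias=True does per treatment group.
coef_centered = np.linalg.lstsq(X - X.mean(axis=0), y - y.mean(), rcond=None)[0]

assert np.allclose(coef_with_bias, coef_centered)  # slope coefficients agree
```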