Skip to content

Commit

Permalink
asymmetric truncation and verbosity (#27)
Browse files Browse the repository at this point in the history
Add asymmetric truncation to the IPW model.
Also allow for a verbosity parameter to control whether
to print the number of truncated units.
  • Loading branch information
liranszlak authored Jan 16, 2022
1 parent 2636149 commit 6572fae
Show file tree
Hide file tree
Showing 10 changed files with 135 additions and 73 deletions.
86 changes: 53 additions & 33 deletions causallib/estimation/ipw.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,28 +26,31 @@
from ..utils.stat_utils import robust_lookup


# TODO: implement a two-caliper truncation, one lower bound truncation epsilon and an upper bound one.


class IPW(PropensityEstimator, PopulationOutcomeEstimator):
"""
Causal model implementing inverse probability (propensity score) weighting.
w_i = 1 / Pr[A=a_i|Xi]
"""

def __init__(self, learner, truncate_eps=None, use_stabilized=False):
def __init__(self, learner, clip_min=None, clip_max=None, use_stabilized=False, verbose=False):
"""
Args:
learner: Initialized sklearn model.
truncate_eps (None|float): Optional value between 0 to 0.5 to clip the propensity estimation.
Will clip probabilities between clip_eps and 1-clip_eps.
clip_min (None|float): Optional value between 0 to 0.5 to lower bound the propensity estimation by
clipping it. Will clip probabilities under clip_min to this value.
clip_max (None|float): Optional value between 0.5 to 1 to upper bound the propensity estimation by
clipping it. Will clip probabilities above clip_max to this value.
use_stabilized (bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title
verbose (bool): Whether to print summary statistics regarding the number of samples clipped due to
clip_min and clip_max.
"""
super(IPW, self).__init__(learner, use_stabilized)
self.__check_truncation_value_is_legal(truncate_eps)
self.truncate_eps = truncate_eps
self.__check_truncation_value_is_legal(clip_min, clip_max)
self.clip_min = clip_min
self.clip_max = clip_max
self.verbose = verbose

def fit(self, X, a, y=None):
if self.use_stabilized:
Expand All @@ -62,7 +65,7 @@ def _predict(self, X):
prediction_matrix = pd.DataFrame(prediction_matrix, index=X.index, columns=columns)
return prediction_matrix

def compute_weights(self, X, a, treatment_values=None, truncate_eps=None, use_stabilized=None):
def compute_weights(self, X, a, treatment_values=None, clip_min=None, clip_max=None, use_stabilized=None):
"""
Computes individual weight given the individual's treatment assignment.
w_i = 1 / Pr[A=a_i|X_i] for each individual i.
Expand All @@ -74,8 +77,10 @@ def compute_weights(self, X, a, treatment_values=None, truncate_eps=None, use_st
value should be calculated).
If not specified, then the weights are chosen by the individual's actual
treatment assignment.
truncate_eps (None|float): Optional value between 0 to 0.5 to clip the propensity estimation.
Will clip probabilities between clip_eps and 1-clip_eps.
clip_min (None|float): Optional value between 0 to 0.5 to lower bound the propensity estimation by
clipping it. Will clip probabilities under clip_min to this value.
clip_max (None|float): Optional value between 0.5 to 1 to upper bound the propensity estimation by
clipping it. Will clip probabilities above clip_max to this value.
use_stabilized (None|bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
This overrides the use_stabilized parameter provided at initialization.
If True provided, but the model was initialized with use_stabilized=False, then
Expand All @@ -88,23 +93,25 @@ def compute_weights(self, X, a, treatment_values=None, truncate_eps=None, use_st
n_samples with a weight for each sample is returned.
If treatment_values is a list/array, then a DataFrame is returned.
"""
weight_matrix = self.compute_weight_matrix(X, a, truncate_eps, use_stabilized)
weight_matrix = self.compute_weight_matrix(X, a, clip_min, clip_max, use_stabilized)
if treatment_values is None:
weights = robust_lookup(weight_matrix, a) # lookup table: take the column a[i] for every i in index(a).
else:
weights = weight_matrix[treatment_values]
return weights

def compute_weight_matrix(self, X, a, truncate_eps=None, use_stabilized=None):
def compute_weight_matrix(self, X, a, clip_min=None, clip_max=None, use_stabilized=None):
"""
Computes individual weight across all possible treatment values.
w_ij = 1 / Pr[A=a_j | X_i] for all individual i and treatment j.
Args:
X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
a (pd.Series): Treatment assignment of size (num_subjects,).
truncate_eps (None|float): Optional value between 0 to 0.5 to clip the propensity estimation.
Will clip probabilities between clip_eps and 1-clip_eps.
clip_min (None|float): Optional value between 0 to 0.5 to lower bound the propensity estimation by clipping it.
Will clip probabilities under clip_min to this value.
clip_max (None|float): Optional value between 0.5 to 1 to upper bound the propensity estimation by clipping it.
Will clip probabilities above clip_max to this value.
use_stabilized (None|bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
This overrides the use_stabilized parameter provided at initialization.
If True provided, but the model was initialized with use_stabilized=False, then
Expand All @@ -118,7 +125,7 @@ def compute_weight_matrix(self, X, a, truncate_eps=None, use_stabilized=None):
"""
use_stabilized = self.use_stabilized if use_stabilized is None else use_stabilized

probabilities = self.compute_propensity_matrix(X, a, truncate_eps)
probabilities = self.compute_propensity_matrix(X, a, clip_min, clip_max)

# weight_matrix = 1.0 / probabilities # type: pd.DataFrame
weight_matrix = probabilities.rdiv(1.0)
Expand All @@ -137,7 +144,7 @@ def compute_weight_matrix(self, X, a, truncate_eps=None, use_stabilized=None):

return weight_matrix

def compute_propensity(self, X, a, treatment_values=None, truncate_eps=None):
def compute_propensity(self, X, a, treatment_values=None, clip_min=None, clip_max=None):
"""
Args:
Expand All @@ -147,8 +154,10 @@ def compute_propensity(self, X, a, treatment_values=None, truncate_eps=None):
treatment value should be calculated).
If not specified, then the maximal treatment value is chosen. This is since
the usual case is of treatment (A=1) control (A=0) setting.
truncate_eps (None|float): Optional value between 0 to 0.5 to clip the propensity estimation.
Will clip probabilities between clip_eps and 1-clip_eps.
clip_min (None|float): Optional value between 0 to 0.5 to lower bound the propensity estimation by clipping it.
Will clip probabilities under clip_min to this value.
clip_max (None|float): Optional value between 0.5 to 1 to upper bound the propensity estimation by clipping it.
Will clip probabilities above clip_max to this value.
Returns:
pd.DataFrame | pd.Series: A matrix/vector num_subjects rows and number of columns is the number of values
Expand All @@ -160,33 +169,35 @@ def compute_propensity(self, X, a, treatment_values=None, truncate_eps=None):
"""
treatment_values = a.max() if treatment_values is None else treatment_values

probabilities = self.compute_propensity_matrix(X, a, truncate_eps)
probabilities = self.compute_propensity_matrix(X, a, clip_min, clip_max)
probabilities = probabilities[treatment_values]
return probabilities

def compute_propensity_matrix(self, X, a=None, truncate_eps=None):
def compute_propensity_matrix(self, X, a=None, clip_min=None, clip_max=None):
"""
Args:
X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
a (pd.Series): Treatment assignment of size (num_subjects,).
truncate_eps (None|float): Optional value between 0 to 0.5 to clip the propensity estimation.
Will clip probabilities between clip_eps and 1-clip_eps.
clip_min (None|float): Optional value between 0 to 0.5 to lower bound the propensity estimation by clipping it.
Will clip probabilities under clip_min to this value.
clip_max (None|float): Optional value between 0.5 to 1 to upper bound the propensity estimation by clipping it.
Will clip probabilities above clip_max to this value.
Returns:
pd.DataFrame: A matrix of size (num_subjects, num_treatments) with probability for every individual and e
very treatment.
"""
truncate_eps = self.truncate_eps if truncate_eps is None else truncate_eps
self.__check_truncation_value_is_legal(truncate_eps)
clip_min = self.clip_min if clip_min is None else clip_min
clip_max = self.clip_max if clip_max is None else clip_max
self.__check_truncation_value_is_legal(clip_min, clip_max)

probabilities = self._predict(X)
if truncate_eps is not None: # since truncation value is legal, it must be a float.
print("Fraction of values being truncated: {:.5f}."
.format(probabilities.apply(lambda x: ~x.between(truncate_eps, 1 - truncate_eps)).sum().sum() /
probabilities.size)) # TODO: do as log

probabilities = probabilities.clip(lower=truncate_eps, upper=1 - truncate_eps)
clip_min = 0 if clip_min is None else clip_min
clip_max = 1 if clip_max is None else clip_max
self.__count_truncated(probabilities, clip_min, clip_max)
probabilities = probabilities.clip(lower=clip_min, upper=clip_max)

return probabilities

Expand Down Expand Up @@ -215,6 +226,15 @@ def estimate_population_outcome(self, X, a, y, w=None, treatment_values=None):
return res

@staticmethod
def __check_truncation_value_is_legal(truncate_eps):
if truncate_eps is not None and not 0 <= truncate_eps <= 0.5:
raise AssertionError("Provided value for truncation (truncate_eps) should be between 0.0 and 0.5")
def __check_truncation_value_is_legal(clip_min, clip_max):
if clip_min is not None and not 0 <= clip_min <= 0.5:
raise AssertionError("Provided value for truncation (clip_min) should be between 0.0 and 0.5")
if clip_max is not None and not 0.5 <= clip_max <= 1:
raise AssertionError("Provided value for truncation (clip_max) should be between 0.5 and 1.0")

def __count_truncated(self, probabilities, clip_min, clip_max):
num_clipped = probabilities.apply(lambda x: ~x.between(clip_min, clip_max)).sum().sum()
fraction_clipped = num_clipped/probabilities.size
if self.verbose:
print(f'Fraction of values being truncated: {fraction_clipped:.5f}.') # TODO: do as log
return fraction_clipped
15 changes: 8 additions & 7 deletions causallib/estimation/overlap_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,16 @@ def __init__(self, learner, use_stabilized=False):
"""
super(OverlapWeights, self).__init__(learner, use_stabilized)

def compute_weight_matrix(self, X, a, truncate_eps=None, use_stabilized=None):
def compute_weight_matrix(self, X, a, clip_min=None, clip_max=None, use_stabilized=None):
"""
Computes individual weight across all possible treatment values.
w_ij = 1 - Pr[A=a_j | X_i] for all individual i and treatment j.
Args:
X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
a (pd.Series): Treatment assignment of size (num_subjects,).
truncate_eps (None): Ignored.
clip_min (None|float): Lower bound for propensity scores. Better be left `None`.
clip_max (None|float): Upper bound for propensity scores. Better be left `None`.
use_stabilized (None|bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
This overrides the use_stabilized parameter provided at initialization.
If True provided, but the model was initialized with use_stabilized=False, then
Expand All @@ -78,11 +79,11 @@ def compute_weight_matrix(self, X, a, truncate_eps=None, use_stabilized=None):
use_stabilized = self.use_stabilized if use_stabilized is None else use_stabilized
# Check that number of unique classes is 2
self.__check_number_of_classes_is_legal(a)
# Check truncate_eps is None|False otherwise a warning is printed
self.__check_truncate_eps_is_none(truncate_eps)
# Truncation is generally bad, check and warn:
self.__check_truncation_value_is_none(clip_min, clip_max)

# COmpute propensity scores
probabilities = self.compute_propensity_matrix(X, a, truncate_eps)
probabilities = self.compute_propensity_matrix(X, a, clip_min, clip_max)
# weight matrix: 1-P[a_i=1|x]
# Reverse probabilities to opposite classes:
probabilities.columns = probabilities.columns[::-1] # Flip name-based indexing
Expand Down Expand Up @@ -132,8 +133,8 @@ def __check_number_of_classes_is_legal(x):
raise AssertionError("Number of unique classes should be equal 2")

@staticmethod
def __check_truncate_eps_is_none(truncate_eps):
if truncate_eps is not None:
def __check_truncation_value_is_none(clip_min, clip_max):
if clip_min is not None or clip_max is not None:
warnings.warn(
"Trimming observations with Overlap Weighting may be redundant, "
"as extreme observations can receive greater importance than they should.",
Expand Down
6 changes: 3 additions & 3 deletions causallib/tests/test_doublyrobust.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def ensure_pipeline_learner(self):
self.estimator.estimate_individual_outcome(data["X"], data["a"])
self.assertTrue(True) # Dummy assert for not thrown exception

def ensure_many_models(self, truncate_eps=None):
def ensure_many_models(self, clip_min=None, clip_max=None):
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet, RANSACRegressor, HuberRegressor, PassiveAggressiveRegressor
Expand All @@ -172,7 +172,7 @@ def ensure_many_models(self, truncate_eps=None):
RandomForestClassifier(n_estimators=100),
MLPClassifier(hidden_layer_sizes=(5,)),
KNeighborsClassifier(n_neighbors=20)]:
weight_model = IPW(propensity_learner, truncate_eps=truncate_eps)
weight_model = IPW(propensity_learner, clip_min=clip_min, clip_max=clip_max)
propensity_learner_name = str(propensity_learner).split("(", maxsplit=1)[0]
for outcome_learner in [GradientBoostingRegressor(n_estimators=10), RandomForestRegressor(n_estimators=10),
MLPRegressor(hidden_layer_sizes=(5,)),
Expand Down Expand Up @@ -339,4 +339,4 @@ def test_pipeline_learner(self):
self.ensure_pipeline_learner()

def test_many_models(self):
self.ensure_many_models(truncate_eps=0.001)
self.ensure_many_models(clip_min=0.001, clip_max=1-0.001)
Loading

0 comments on commit 6572fae

Please sign in to comment.