-
Notifications
You must be signed in to change notification settings - Fork 95
/
search.py
164 lines (137 loc) · 7.33 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Interfacing between causallib and sklearn to take advantage of sklearn's
hyperparameter search machinery (e.g., GridSearchCV).
Wraps causallib's models and scorers to look like sklearn's,
and wraps sklearn's hyperparameter search models to look like causallib models.
"""
from typing import Type
from sklearn.model_selection._search import BaseSearchCV
import pandas as pd
from ..metrics.scorers import get_scorer
def _adapt_causal_scorer_to_sklearn(scorer):
    """Expose a causallib scorer through a scikit-learn style scorer signature.

    A causallib scorer is called as `scorer(estimator, X, a, y, **kwargs)`,
    whereas the returned callable is called as `score(estimator, Xa, y, **kwargs)`.
    The returned callable splits its data argument positionally:
    the last column is passed on as `a` and all preceding columns as `X`.

    Args:
        scorer (callable): a causallib scorer.

    Returns:
        score (callable): a scikit-learn score interface for `scorer`.
    """

    def score(estimator, joinedXa, y_true, sample_weight=None, **kwargs):
        # `joinedXa` must support `.iloc` (e.g., a pandas DataFrame);
        # the treatment assignment is taken to be its last column.
        covariates = joinedXa.iloc[:, :-1]
        treatment = joinedXa.iloc[:, -1]
        return scorer(
            estimator,
            covariates,
            treatment,
            y_true,
            sample_weight=sample_weight,
            **kwargs,
        )

    return score
def _adapt_causal_scorers_to_sklearn(scorers):
    """Wraps each causallib scorer in a possible dict/list of them.

    Only supports causallib's scorers.
    Mostly compatible with `scoring` parameter in:
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

    Args:
        scorers (str, callable, list, tuple, set, dict):
            A strategy to evaluate the performance of a causallib model.
            Can be either single or multiple scores.
            If `scorers` represents a single score, one can use:
              * A string (see causallib.metrics.get_scorer_names for all available names).
              * A callable following the API in causallib.metrics.scorers: `estimator, X, a, y, **kwargs`.
            If `scorers` represents multiple scores, one can use:
              * A list, tuple or set of scorer names.
              * A dict mapping between metric names and callable scorers
                (following the API in causallib.metrics.scorers).

    Returns:
        scores (callable, dict): causal scorers compatible with sklearn's scorers' API.
            A single callable when `scorers` is a string or a callable;
            a dict mapping metric names to callables when `scorers` is a
            dict or a list/tuple/set of scorer names.

    Raises:
        ValueError: if `scorers` is none of the supported types.
    """
    # TODO: adapt only if isinstance of BaseCausalScorer and leave sklearn's scorers untouched?
    if isinstance(scorers, dict):
        return {
            name: _adapt_causal_scorer_to_sklearn(scorer)
            for name, scorer in scorers.items()
        }
    if isinstance(scorers, (list, tuple, set)):
        # A collection of names is resolved through `get_scorer` and keyed by name.
        return {
            name: _adapt_causal_scorer_to_sklearn(get_scorer(name))
            for name in scorers
        }
    if isinstance(scorers, str):
        return _adapt_causal_scorer_to_sklearn(get_scorer(scorers))
    if callable(scorers):
        return _adapt_causal_scorer_to_sklearn(scorers)

    # Adjacent literals are concatenated verbatim, so each fragment carries
    # its own trailing space to keep the message readable.
    raise ValueError(
        f"`scoring` is invalid (got {scorers}). "
        "Please provide either: "
        "a callable compatible with causallib.metrics.scorers API, "
        "a scorer name from causallib.metrics.get_scorer_names, "
        "a list or tuple of scorer names, "
        "or a dictionary mapping between metric names and valid scorers"
    )
def _adapt_causal_estimator_to_sklearn(estimator):
    """Build a `fit(Xa, y)` flavoured copy of a causallib model instance.

    A subclass of the estimator's own class is created on the fly, so the
    returned object keeps the same (inference) capabilities as the causallib
    estimator; only `fit` changes, from `fit(X, a, y)` to `fit(X', y)` where
    `X'` holds the covariates with the treatment assignment as its last column.

    Args:
        estimator: a causallib model instance exposing `get_params`/`set_params`.
            Its parameters must include `learner`, which is passed to the
            constructor of the adapted class (a missing key raises `KeyError`).

    Returns:
        An instance of the dynamically created `SklearnCompatible<ClassName>`
        class, configured with the parameters of `estimator`.
    """
    base_cls = estimator.__class__

    class SklearnCompatibleEstimator(base_cls):
        def fit(self, joinedXa, y, *args, **kwargs):
            # Split positionally: last column is the treatment, the rest are covariates.
            covariates = joinedXa.iloc[:, :-1]
            treatment = joinedXa.iloc[:, -1]
            return super().fit(covariates, treatment, y, *args, **kwargs)

        @property
        def estimator(self):
            # Copies this wrapper's current parameters onto the original
            # (wrapped) estimator and returns that original object.
            return estimator.set_params(**self.get_params())

    SklearnCompatibleEstimator.__name__ = f"SklearnCompatible{base_cls.__name__}"
    SklearnCompatibleEstimator.__qualname__ = f"SklearnCompatible{base_cls.__qualname__}"

    source_params = estimator.get_params()
    adapted = SklearnCompatibleEstimator(learner=source_params["learner"])
    return adapted.set_params(**source_params)
def causalize_searcher(searcher_type: Type[BaseSearchCV]):
    """Wraps a hyperparameter search algorithm (like sklearn's GridSearchCV)
    with a causallib model interface.

    Args:
        searcher_type: A class of hyperparameter search algorithm
                       (e.g., sklearn's GridSearchCV)

    Returns:
        searcher(searcher_type): a class definition of the provided searcher
            with a causallib `fit(X, a, y)` interface and the underlying estimator capabilities.
            Its constructor takes the causallib estimator first, followed by the
            arguments of `searcher_type`; `scoring` must be given as a keyword argument.

    Examples:
        >>> from sklearn.model_selection import GridSearchCV
        >>> from sklearn.linear_model import LogisticRegression
        >>> from causallib.estimation import IPW
        >>> from causallib.metrics import get_scorer
        >>> from causallib.datasets import load_nhefs
        >>> data = load_nhefs()
        >>> CausalGridSearchCV = causalize_searcher(GridSearchCV)
        >>> model = IPW(LogisticRegression())
        >>> scorer = get_scorer("weighted_roc_auc_error")
        >>> param_grid = dict(clip_min=[0.2, 0.3])
        >>> grid_model = CausalGridSearchCV(model, param_grid=param_grid, scoring=scorer)  # GridSearchCV parameters
        >>> grid_model.fit(data.X, data.a, data.y)  # causallib interface
        >>> grid_model.estimate_population_outcome(data.X, data.a, data.y)
        >>> grid_model.compute_propensity(data.X, data.a)  # IPW capabilities
    """

    class CausalSearcher(searcher_type):
        def __init__(self, estimator, *args, **kwargs):
            # Both the estimator and the scorer(s) are adapted to the
            # `(Xa, y)` interface the underlying searcher works with.
            # `scoring` is read directly from `kwargs`, so it is mandatory.
            estimator = _adapt_causal_estimator_to_sklearn(estimator)
            kwargs["scoring"] = _adapt_causal_scorers_to_sklearn(kwargs["scoring"])
            super().__init__(estimator, *args, **kwargs)

        def _set_methods_from_estimator(self):
            """Exposes all the methods from the internal `best_estimator_`,
            so that the `CausalSearcher` behaves like its internal causallib model.

            Methods forwarded by a previous call are removed first, so that after
            re-fitting they are bound to the current `best_estimator_`
            rather than to a stale one.
            """
            for attr_name in getattr(self, "_forwarded_estimator_methods", ()):
                self.__dict__.pop(attr_name, None)

            best_estimator = self.best_estimator_
            # Avoid re-setting and overwriting existing methods:
            searcher_attributes = set(dir(self))
            forwarded = []
            for attr_name in dir(best_estimator):
                attr_value = getattr(best_estimator, attr_name, None)
                if (
                    attr_name not in searcher_attributes  # not already provided by the searcher
                    and not attr_name.startswith("__")  # not internal method
                    and callable(attr_value)  # the current attribute is a method
                    and attr_name != "fit"  # don't overwrite the GridSearch-like fit below.
                ):
                    setattr(self, attr_name, attr_value)
                    forwarded.append(attr_name)
            # Remember what was forwarded so the next fit can re-bind it.
            self._forwarded_estimator_methods = tuple(forwarded)

        def fit(self, X, a, y, *, groups=None, **fit_params):
            """Run the hyperparameter search using the causallib `fit(X, a, y)` interface.

            `a` is appended to `X` as the last column(s) and the joined data is
            handed to the underlying searcher's `fit`.

            Returns:
                self
            """
            joinedXa = pd.concat([X, a], axis=1)
            super().fit(joinedXa, y, groups=groups, **fit_params)
            # NOTE: open design question - should `best_estimator_` be the original
            # causallib model (`self.estimator.estimator`) rather than the adapted one?
            # `best_estimator_` may be absent after the search (e.g., sklearn
            # searchers configured with `refit=False`); nothing to forward then.
            if hasattr(self, "best_estimator_"):
                self._set_methods_from_estimator()
            return self

    CausalSearcher.__name__ = f"Causal{searcher_type.__name__}"
    CausalSearcher.__qualname__ = f"Causal{searcher_type.__qualname__}"
    return CausalSearcher