diff --git a/doubleml/__init__.py b/doubleml/__init__.py index cb3891ba..06d4cd96 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -15,6 +15,7 @@ from .irm.ssm import DoubleMLSSM from .plm.lplr import DoubleMLLPLR from .plm.pliv import DoubleMLPLIV +from .plm.plpr import DoubleMLPLPR from .plm.plr import DoubleMLPLR from .utils.blp import DoubleMLBLP from .utils.policytree import DoubleMLPolicyTree @@ -44,6 +45,7 @@ "DoubleMLPolicyTree", "DoubleMLSSM", "DoubleMLLPLR", + "DoubleMLPLPR", ] __version__ = importlib.metadata.version("doubleml") diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index 22aad0f7..9a113ebc 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -41,6 +41,13 @@ class DoubleMLPanelData(DoubleMLData): The instrumental variable(s). Default is ``None``. + static_panel : bool + Indicates whether the data model corresponds to a static + panel data approach (``True``) or to staggered adoption panel data + (``False``). In the latter case, the treatment groups/values are defined in terms of the first time of + treatment exposure. + Default is ``False``. + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. @@ -82,21 +89,29 @@ def __init__( id_col, x_cols=None, z_cols=None, + static_panel=False, use_other_treat_as_covariate=True, force_all_x_finite=True, datetime_unit="M", ): DoubleMLBaseData.__init__(self, data) + self._static_panel = static_panel + # we need to set id_col (needs _data) before call to the super __init__ because of the x_cols setter self.id_col = id_col - self._datetime_unit = _is_valid_datetime_unit(datetime_unit) self._set_id_var() - # Set time column before calling parent constructor self.t_col = t_col + self._datetime_unit = _is_valid_datetime_unit(datetime_unit) + + if not self.static_panel: + cluster_cols = None + force_all_d_finite = False + else: + cluster_cols = id_col + force_all_d_finite = True - # Call parent constructor DoubleMLData.__init__( self, data=data, @@ -104,9 +119,10 @@ def __init__( d_cols=d_cols, x_cols=x_cols, z_cols=z_cols, + cluster_cols=cluster_cols, use_other_treat_as_covariate=use_other_treat_as_covariate, force_all_x_finite=force_all_x_finite, - force_all_d_finite=False, + force_all_d_finite=force_all_d_finite, ) # reset index to ensure a simple RangeIndex @@ -115,15 +131,15 @@ def __init__( # Set time variable array after data is loaded self._set_time_var() - if self.n_treat != 1: - raise ValueError("Only one treatment column is allowed for panel data.") - self._check_disjoint_sets_id_col() # intialize the unique values of g and t self._g_values = np.sort(np.unique(self.d)) # unique values of g self._t_values = np.sort(np.unique(self.t)) # unique values of t + if self.n_treat != 1: + raise ValueError("Only one treatment column is allowed for panel data.") + def __str__(self): data_summary = self._data_summary_str() buf = io.StringIO() @@ -146,6 +162,7 @@ def _data_summary_str(self): f"Instrument variable(s): {self.z_cols}\n" f"Time variable: {self.t_col}\n" f"Id variable: {self.id_col}\n" + f"Static panel data: {self.static_panel}\n" ) data_summary += f"No. Unique Ids: {self.n_ids}\n" f"No. Observations: {self.n_obs}\n" @@ -296,6 +313,11 @@ def n_t_periods(self): """ return len(self.t_values) + @property + def static_panel(self): + """Indicates whether the data model corresponds to a static panel data approach.""" + return self._static_panel + def _get_optional_col_sets(self): base_optional_col_sets = super()._get_optional_col_sets() id_col_set = {self.id_col} diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py index d8506b0d..0698368c 100644 --- a/doubleml/data/tests/test_panel_data.py +++ b/doubleml/data/tests/test_panel_data.py @@ -157,14 +157,26 @@ def test_panel_data_str(): assert "Time variable: t" in dml_str assert "Id variable: id" in dml_str assert "No. Observations:" in dml_str + assert "Static panel data:" in dml_str + + +@pytest.fixture(scope="module", params=[True, False]) +def static_panel(request): + return request.param @pytest.mark.ci -def test_panel_data_properties(): +def test_panel_data_properties(static_panel): np.random.seed(3141) df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data dml_data = DoubleMLPanelData( - data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)] + data=df, + y_col="y", + d_cols="d", + t_col="t", + id_col="id", + x_cols=[f"Z{i + 1}" for i in np.arange(4)], + static_panel=static_panel, ) assert np.array_equal(dml_data.id_var, df["id"].values) @@ -176,3 +188,10 @@ def test_panel_data_properties(): assert dml_data.n_groups == len(np.unique(df["d"].values)) assert np.array_equal(dml_data.t_values, np.sort(np.unique(df["t"].values))) assert dml_data.n_t_periods == len(np.unique(df["t"].values)) + + if static_panel: + assert dml_data.static_panel is True + assert dml_data.cluster_cols == ["id"] + else: + assert dml_data.static_panel is False + assert dml_data.cluster_cols is None diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index 283bc91b..f4ce2b83 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -4,6 +4,7 @@ from .lplr import DoubleMLLPLR from .pliv import DoubleMLPLIV +from .plpr import DoubleMLPLPR from .plr import DoubleMLPLR -__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"] +__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR", "DoubleMLPLPR"] diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index 6e8e9bb5..a42715bd 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -7,6 +7,7 @@ from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 from .dgp_pliv_CHS2015 import make_pliv_CHS2015 from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 +from .dgp_plpr_CP2025 import make_plpr_CP2025 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 from .dgp_plr_turrell2018 import make_plr_turrell2018 @@ -18,4 +19,5 @@ "make_pliv_multiway_cluster_CKMS2021", "make_lplr_LZZ2020", "_make_pliv_data", + "make_plpr_CP2025", ] diff --git a/doubleml/plm/datasets/dgp_plpr_CP2025.py b/doubleml/plm/datasets/dgp_plpr_CP2025.py new file mode 100644 index 00000000..8e07ff48 --- /dev/null +++ b/doubleml/plm/datasets/dgp_plpr_CP2025.py @@ -0,0 +1,134 @@ +import numpy as np +import pandas as pd + + +def make_plpr_CP2025(num_id=250, num_t=10, dim_x=30, theta=0.5, dgp_type="dgp1"): + """ + Generates synthetic data for a partially linear panel regression model, based on Clarke and Polselli (2025). + The data generating process is defined as + + .. math:: + + Y_{it} &= D_{it} \\theta + l_0(X_{it}) + \\alpha_i + U_{it}, & &U_{it} \\sim \\mathcal{N}(0,1), + + D_{it} &= m_0(X_{it}) + c_i + V_{it}, & &V_{it} \\sim \\mathcal{N}(0,1), + + \\alpha_i &= 0.25 \\left(\\frac{1}{T} \\sum_{t=1}^{T} D_{it} - \\bar{D} \\right) + + 0.25 \\frac{1}{T} \\sum_{t=1}^{T} \\sum_{k \\in \\mathcal{K}} X_{it,k} + a_i + + + with :math:`a_i \\sim \\mathcal{N}(0,0.95)`, :math:`X_{it,p} \\sim \\mathcal{N}(0,5)`, :math:`c_i \\sim \\mathcal{N}(0,1)`. + Where :math:`k \\in \\mathcal{K} = \\{1,3\\}` is the number of relevant (non-zero) confounding variables, and :math:`p` is + the number of total confounding variables. + + Clarke and Polselli (2025) consider three functional forms of the confounders to model the nuisance functions :math:`l_0` + and :math:`m_0` with varying levels of non-linearity and non-smoothness: + + Design 1. (dgp1): Linear in the nuisance parameters + + .. math:: + + l_0(X_{it}) &= a X_{it,1} + X_{it,3} + + m_0(X_{it}) &= a X_{it,1} + X_{it,3} + + Design 2. (dgp2): Non-linear and smooth in the nuisance parameters + + .. math:: + + l_0(X_{it}) &= \\frac{\\exp(X_{it,1})}{1 + \\exp(X_{it,1})} + a \\cos(X_{it,3}) + + m_0(X_{it}) &= \\cos(X_{it,1}) + a \\frac{\\exp(X_{it,3})}{1 + \\exp(X_{it,3})} + + Design 3. (dgp3): Non-linear and discontinuous in the nuisance parameters + + .. math:: + + l_0(X_{it}) &= b (X_{it,1} \\cdot X_{it,3}) + a (X_{it,3} \\cdot 1\\{X_{it,3} > 0\\}) + + m_0(X_{it}) &= a (X_{it,1} \\cdot 1\\{X_{it,1} > 0\\}) + b (X_{it,1} \\cdot X_{it,3}), + + where :math:`a = 0.25`, :math:`b = 0.5`. + + Parameters + ---------- + num_id : + The number of units in the panel. + num_t : + The number of time periods in the panel. + num_x : + The number of confounding variables. + theta : + The value of the causal parameter. + dgp_type : + The type of DGP design to be used. Default is ``'dgp1'``, other options are ``'dgp2'`` and ``'dgp3'``. + + Returns + ------- + pandas.DataFrame + DataFrame containing the simulated static panel data. + + References + ---------- + Clarke, P. S. and Polselli, A. (2025), + Double machine learning for static panel models with fixed effects. The Econometrics Journal, utaf011, + doi:`10.1093/ectj/utaf011 `_. + """ + + # parameters + a = 0.25 + b = 0.5 + sigma2_a = 0.95 + sigma2_x = 5 + + # id and time vectors + id_var = np.repeat(np.arange(1, num_id + 1), num_t) + time = np.tile(np.arange(1, num_t + 1), num_id) + + # individual fixed effects + a_i = np.repeat(np.random.normal(0, np.sqrt(sigma2_a), num_id), num_t) + c_i = np.repeat(np.random.standard_normal(num_id), num_t) + + # covariates and errors + x_mean = 0 + x_it = np.random.normal(loc=x_mean, scale=np.sqrt(sigma2_x), size=(num_id * num_t, dim_x)) + u_it = np.random.standard_normal(num_id * num_t) + v_it = np.random.standard_normal(num_id * num_t) + + # functional forms in nuisance functions + if dgp_type == "dgp1": + l_0 = a * x_it[:, 0] + x_it[:, 2] + m_0 = a * x_it[:, 0] + x_it[:, 2] + elif dgp_type == "dgp2": + l_0 = np.divide(np.exp(x_it[:, 0]), 1 + np.exp(x_it[:, 0])) + a * np.cos(x_it[:, 2]) + m_0 = np.cos(x_it[:, 0]) + a * np.divide(np.exp(x_it[:, 2]), 1 + np.exp(x_it[:, 2])) + elif dgp_type == "dgp3": + l_0 = b * (x_it[:, 0] * x_it[:, 2]) + a * (x_it[:, 2] * np.where(x_it[:, 2] > 0, 1, 0)) + m_0 = a * (x_it[:, 0] * np.where(x_it[:, 0] > 0, 1, 0)) + b * (x_it[:, 0] * x_it[:, 2]) + else: + raise ValueError("Invalid dgp type.") + + # treatment + d_it = m_0 + c_i + v_it + + def alpha_i(x_it, d_it, a_i, num_n, num_t): + d_i = np.array_split(d_it, num_n) + d_i_term = np.repeat(np.mean(d_i, axis=1), num_t) - np.mean(d_it) + + x_i = np.array_split(np.sum(x_it[:, [0, 2]], axis=1), num_n) + x_i_mean = np.mean(x_i, axis=1) + x_i_term = np.repeat(x_i_mean, num_t) + + alpha_term = 0.25 * d_i_term + 0.25 * x_i_term + a_i + return alpha_term + + # outcome + y_it = d_it * theta + l_0 + alpha_i(x_it, d_it, a_i, num_id, num_t) + u_it + + x_cols = [f"x{i + 1}" for i in np.arange(dim_x)] + + data = pd.DataFrame(np.column_stack((id_var, time, y_it, d_it, x_it)), columns=["id", "time", "y", "d"] + x_cols).astype( + {"id": "int64", "time": "int64"} + ) + + return data diff --git a/doubleml/plm/plpr.py b/doubleml/plm/plpr.py new file mode 100644 index 00000000..398dd5fc --- /dev/null +++ b/doubleml/plm/plpr.py @@ -0,0 +1,507 @@ +import warnings + +import numpy as np +import pandas as pd +from sklearn.base import clone +from sklearn.utils import check_X_y + +from ..data.panel_data import DoubleMLPanelData +from ..double_ml import DoubleML +from ..double_ml_score_mixins import LinearScoreMixin +from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity, _check_score +from ..utils._estimation import _dml_cv_predict, _dml_tune + + +class DoubleMLPLPR(LinearScoreMixin, DoubleML): + """Double machine learning for partially linear panel regression models + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + + ml_l : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`\\l_0(X) = E[Y|X]`. + + ml_m : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`m_0(X) = E[D|X]`. + For binary treatment variables :math:`D` (with values 0 and 1) and the CRE approaches, a classifier + implementing ``fit()`` and ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` + returns ``True``, ``predict_proba()`` is used otherwise ``predict()``. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function + :math:`g_0(X) = E[Y - D \\theta_0|X]`. + Note: The learner `ml_g` is only required for the score ``'IV-type'``. Optionally, it can be specified and + estimated for callable scores. + + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitons for the sample splitting. + Default is ``1``. + + score : str or callable + A str (``'partialling out'`` or ``'IV-type'``) specifying the score function + or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. + Default is ``'partialling out'``. + + approach : str + A str (``'cre_general'``, ``'cre_normal'``, ``'fd_exact'``, ``'wg_approx'``) specifying the type of + static panel approach in Clarke and Polselli (2025). + Default is ``'fd_exact'``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.plm.datasets import make_plpr_CP2025 + >>> from sklearn.linear_model import LassoCV + >>> from sklearn.base import clone + >>> np.random.seed(3142) + >>> learner = LassoCV() + >>> ml_l = clone(learner) + >>> ml_m = clone(learner) + >>> data = make_plpr_CP2025(num_id=250, num_t=10, dim_x=30, theta=0.5, dgp_type='dgp1') + >>> obj_dml_data = DoubleMLPanelData(data, 'y', 'd', 'time', 'id', static_panel=True) + >>> dml_plpr_obj = DoubleMLPLPR(obj_dml_data, ml_l, ml_m) + >>> dml_plpr_obj.fit().summary + coef std err t P>|t| 2.5 % 97.5 % + d_diff 0.511626 0.024615 20.784933 5.924636e-96 0.463381 0.559871 + + Notes + ----- + **Partially linear panel regression (PLPR)** models take the form + + .. math:: + + Y_{it} &= D_{it} \\theta_0 + l_0(X_{it}) + \\alpha_i + U_{it}, & &\\mathbb{E}(U_{it} | D_{it},X_{it},\\alpha_i) = 0, + + D_{it} &= m_0(X_{it}) + \\gamma_i + V_{it}, & &\\mathbb{E}(V_{it} | X_{it},\\gamma_i) = 0, + + where :math:`Y_{it}` is the outcome variable and :math:`D_{it}` is the policy variable of interest. + The high-dimensional vector :math:`X_{it} = (X_{it,1}, \\ldots, X_{it,p})` consists of other confounding covariates, + :math:`\\alpha_i` and :math:`\\gamma_i` are the unobserved individual heterogeneity correlated with the included + covariates, and :math:`\\U_{it}` and :math:`V_{it}` are stochastic errors. + """ + + def __init__( + self, + obj_dml_data, + ml_l, + ml_m, + ml_g=None, + n_folds=5, + n_rep=1, + score="partialling out", + approach="fd_exact", + draw_sample_splitting=True, + ): + self._check_data(obj_dml_data) + self._original_dml_data = obj_dml_data + + valid_approach = ["cre_general", "cre_normal", "fd_exact", "wg_approx"] + self._check_approach(approach, valid_approach) + self._approach = approach + + # pass transformed data as DoubleMLPanelData to init + self._data_transform, self._transform_cols = self._transform_data() + obj_dml_data_transform = DoubleMLPanelData( + self._data_transform, + y_col=self._transform_cols["y_col"], + d_cols=self._transform_cols["d_cols"], + t_col=self._original_dml_data._t_col, + id_col=self._original_dml_data._id_col, + x_cols=self._transform_cols["x_cols"], + z_cols=self._original_dml_data._z_cols, + static_panel=True, + use_other_treat_as_covariate=self._original_dml_data._use_other_treat_as_covariate, + force_all_x_finite=self._original_dml_data._force_all_x_finite, + ) + super().__init__(obj_dml_data_transform, n_folds, n_rep, score, draw_sample_splitting) + + valid_scores = ["IV-type", "partialling out"] + _check_score(self.score, valid_scores, allow_callable=True) + _ = self._check_learner(ml_l, "ml_l", regressor=True, classifier=False) + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) + # TODO: maybe warning for binary treatment with approaches 'fd_exact' and 'wg_approx' + self._learner = {"ml_l": ml_l, "ml_m": ml_m} + + if ml_g is not None: + if (isinstance(self.score, str) & (self.score == "IV-type")) | callable(self.score): + _ = self._check_learner(ml_g, "ml_g", regressor=True, classifier=False) + self._learner["ml_g"] = ml_g + else: + assert isinstance(self.score, str) & (self.score == "partialling out") + warnings.warn( + ( + 'A learner ml_g has been provided for score = "partialling out" but will be ignored. "' + "A learner ml_g is not required for estimation." + ) + ) + elif isinstance(self.score, str) & (self.score == "IV-type"): + warnings.warn(("For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone(ml_l).")) + self._learner["ml_g"] = clone(ml_l) + + self._predict_method = {"ml_l": "predict"} + if "ml_g" in self._learner: + self._predict_method["ml_g"] = "predict" + if ml_m_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method["ml_m"] = "predict_proba" + else: + msg = ( + f"The ml_m learner {str(ml_m)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) + if self._approach in ["fd_exact", "wg_approx"]: + msg += ( + " Note: In case of binary input treatment variable(s), approaches 'fd_exact' and " + "'wg_approx' tansform the treatment variable(s), such that they are no longer binary." + ) + raise ValueError(msg) + else: + self._predict_method["ml_m"] = "predict" + + self._initialize_ml_nuisance_params() + self._sensitivity_implemented = False + self._external_predictions_implemented = True + self._set_d_mean() + + def _format_score_info_str(self): + score_approach_info = f"Score function: {str(self.score)}\n" f"Static panel model approach: {str(self.approach)}" + return score_approach_info + + def _format_additional_info_str(self): + """ + Includes information on the original data before transformation. + """ + # TODO: adjust header length of additional info in double_ml.py + data_original_summary = ( + f"Cluster variable(s): {self._original_dml_data.cluster_cols}\n" + f"\nPre-Transformation Data Summary: \n" + f"Outcome variable: {self._original_dml_data.y_col}\n" + f"Treatment variable(s): {self._original_dml_data.d_cols}\n" + f"Covariates: {self._original_dml_data.x_cols}\n" + f"No. Observations: {self._original_dml_data.n_obs}\n" + ) + return data_original_summary + + @property + def approach(self): + """ + The static panel approach. + """ + return self._approach + + @property + def data_transform(self): + """ + The transformed static panel data. + """ + return self._data_transform + + @property + def transform_cols(self): + """ + The column names of the transformed static panel data. + """ + return self._transform_cols + + def _initialize_ml_nuisance_params(self): + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError(f"The data must be of DoubleMLPanelData type. {str(type(obj_dml_data))} was passed.") + if not obj_dml_data.static_panel: + raise ValueError( + "For the PLPR model, the DoubleMLPanelData object requires the static_panel flag to be set to True." + ) + if obj_dml_data.z_cols is not None: + raise ValueError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "DoubleMLPLPR currently does not support instrumental variables." + ) + return + + def _check_approach(self, approach, valid_approach): + if isinstance(approach, str): + if approach not in valid_approach: + raise ValueError("Invalid approach " + approach + ". " + "Valid approach " + " or ".join(valid_approach) + ".") + else: + raise TypeError(f"approach should be a string. {str(approach)} was passed.") + return + + def _transform_data(self): + df = self._original_dml_data.data + + y_col = self._original_dml_data.y_col + d_cols = self._original_dml_data.d_cols + x_cols = self._original_dml_data.x_cols + t_col = self._original_dml_data.t_col + id_col = self._original_dml_data.id_col + + if self._approach in ["cre_general", "cre_normal"]: + df_id_means = df[[id_col] + x_cols].groupby(id_col).transform("mean") + df_means = df_id_means.add_suffix("_mean") + data = pd.concat([df, df_means], axis=1) + cols = {"y_col": y_col, "d_cols": d_cols, "x_cols": x_cols + [f"{x}_mean" for x in x_cols]} + elif self._approach == "fd_exact": + # TODO: potential issues with unbalanced panels/missing periods, right now the + # last available is used for the lag and first difference. Maybe reindex to a complete time grid per id. + df = df.sort_values([id_col, t_col]).copy() + shifted = df[[id_col] + x_cols].groupby(id_col).shift(1).add_suffix("_lag") + first_diff = df[[id_col] + [y_col] + d_cols].groupby(id_col).diff().add_suffix("_diff") + df_fd = pd.concat([df, shifted], axis=1) + # replace original y and d columns for first-difference transformations, rename + df_fd[[y_col] + d_cols] = first_diff + cols_rename_dict = {y_col: f"{y_col}_diff"} | {col: f"{col}_diff" for col in d_cols} + df_fd = df_fd.rename(columns=cols_rename_dict) + # drop rows for first period + data = df_fd.dropna(subset=[x_cols[0] + "_lag"]).reset_index(drop=True) + cols = { + "y_col": f"{y_col}_diff", + "d_cols": [f"{d}_diff" for d in d_cols], + "x_cols": x_cols + [f"{x}_lag" for x in x_cols], + } + elif self._approach == "wg_approx": + cols_to_demean = [y_col] + d_cols + x_cols + # compute group and grand means for within means + group_means = df.groupby(id_col)[cols_to_demean].transform("mean") + grand_means = df[cols_to_demean].mean() + within_means = df[cols_to_demean] - group_means + grand_means + within_means = within_means.add_suffix("_demean") + data = pd.concat([df[[id_col, t_col]], within_means], axis=1) + cols = { + "y_col": f"{y_col}_demean", + "d_cols": [f"{d}_demean" for d in d_cols], + "x_cols": [f"{x}_demean" for x in x_cols], + } + + return data, cols + + def _set_d_mean(self): + if self._approach in ["cre_general", "cre_normal"]: + data = self._original_dml_data.data + d_cols = self._original_dml_data.d_cols + id_col = self._original_dml_data.id_col + help_d_mean = data.loc[:, [id_col] + d_cols] + d_mean = help_d_mean.groupby(id_col).transform("mean").values + self._d_mean = d_mean + else: + self._d_mean = None + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, ensure_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, ensure_all_finite=False) + m_external = external_predictions["ml_m"] is not None + l_external = external_predictions["ml_l"] is not None + if "ml_g" in self._learner: + g_external = external_predictions["ml_g"] is not None + else: + g_external = False + + # nuisance l + if l_external: + l_hat = {"preds": external_predictions["ml_l"], "targets": None, "models": None} + elif self._score == "IV-type" and g_external: + l_hat = {"preds": None, "targets": None, "models": None} + else: + l_hat = _dml_cv_predict( + self._learner["ml_l"], + x, + y, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_l"), + method=self._predict_method["ml_l"], + return_models=return_models, + ) + _check_finite_predictions(l_hat["preds"], self._learner["ml_l"], "ml_l", smpls) + + # nuisance m + if m_external: + m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + else: + if self._approach == "cre_normal": + d_mean = self._d_mean[:, self._i_treat] + x_m = np.column_stack((x, d_mean)) + else: + x_m = x + + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x_m, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) + + # general cre adjustment + if self._approach == "cre_general": + d_mean = self._d_mean[:, self._i_treat] + df_m_hat = pd.DataFrame({"id": self._dml_data.id_var, "m_hat": m_hat["preds"]}) + m_hat_mean = df_m_hat.groupby(["id"]).transform("mean") + m_hat_star = m_hat["preds"] + d_mean - m_hat_mean["m_hat"] + m_hat["preds"] = m_hat_star + + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) + if self._check_learner(self._learner["ml_m"], "ml_m", regressor=True, classifier=True): + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) + + if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: + _check_binary_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", self._dml_data.d_cols[self._i_treat]) + + # an estimate of g is obtained for the IV-type score and callable scores + g_hat = {"preds": None, "targets": None, "models": None} + if "ml_g" in self._learner: + # nuisance g + if g_external: + g_hat = {"preds": external_predictions["ml_g"], "targets": None, "models": None} + else: + # get an initial estimate for theta using the partialling out score + psi_a = -np.multiply(d - m_hat["preds"], d - m_hat["preds"]) + psi_b = np.multiply(d - m_hat["preds"], y - l_hat["preds"]) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + g_hat = _dml_cv_predict( + self._learner["ml_g"], + x, + y - theta_initial * d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + _check_finite_predictions(g_hat["preds"], self._learner["ml_g"], "ml_g", smpls) + + psi_a, psi_b = self._score_elements(y, d, l_hat["preds"], m_hat["preds"], g_hat["preds"], smpls) + psi_elements = {"psi_a": psi_a, "psi_b": psi_b} + preds = { + "predictions": {"ml_l": l_hat["preds"], "ml_m": m_hat["preds"], "ml_g": g_hat["preds"]}, + "targets": {"ml_l": l_hat["targets"], "ml_m": m_hat["targets"], "ml_g": g_hat["targets"]}, + "models": {"ml_l": l_hat["models"], "ml_m": m_hat["models"], "ml_g": g_hat["models"]}, + } + + return psi_elements, preds + + def _score_elements(self, y, d, l_hat, m_hat, g_hat, smpls): + # compute residual + v_hat = d - m_hat + + if isinstance(self.score, str): + if self.score == "IV-type": + psi_a = -np.multiply(v_hat, d) + psi_b = np.multiply(v_hat, y - g_hat) + else: + assert self.score == "partialling out" + u_hat = y - l_hat + psi_a = -np.multiply(v_hat, v_hat) + psi_b = np.multiply(v_hat, u_hat) + else: + assert callable(self.score) + psi_a, psi_b = self.score(y=y, d=d, l_hat=l_hat, m_hat=m_hat, g_hat=g_hat, smpls=smpls) + + return psi_a, psi_b + + def _sensitivity_element_est(self, preds): + pass + + def _nuisance_tuning( + self, + smpls, + param_grids, + scoring_methods, + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, ensure_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, ensure_all_finite=False) + + if scoring_methods is None: + scoring_methods = {"ml_l": None, "ml_m": None, "ml_g": None} + + train_inds = [train_index for (train_index, _) in smpls] + l_tune_res = _dml_tune( + y, + x, + train_inds, + self._learner["ml_l"], + param_grids["ml_l"], + scoring_methods["ml_l"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + if self._approach == "cre_normal": + d_mean = self._d_mean[:, self._i_treat] + x_m = np.column_stack((x, d_mean)) + else: + x_m = x + + m_tune_res = _dml_tune( + d, + x_m, + train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + l_best_params = [xx.best_params_ for xx in l_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + # an ML model for g is obtained for the IV-type score and callable scores + if "ml_g" in self._learner: + # construct an initial theta estimate from the tuned models using the partialling out score + l_hat = np.full_like(y, np.nan) + m_hat = np.full_like(d, np.nan) + for idx, (train_index, _) in enumerate(smpls): + l_hat[train_index] = l_tune_res[idx].predict(x[train_index, :]) + m_hat[train_index] = m_tune_res[idx].predict(x_m[train_index, :]) + psi_a = -np.multiply(d - m_hat, d - m_hat) + psi_b = np.multiply(d - m_hat, y - l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + g_tune_res = _dml_tune( + y - theta_initial * d, + x, + train_inds, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + g_best_params = [xx.best_params_ for xx in g_tune_res] + params = {"ml_l": l_best_params, "ml_m": m_best_params, "ml_g": g_best_params} + tune_res = {"l_tune": l_tune_res, "m_tune": m_tune_res, "g_tune": g_tune_res} + else: + params = {"ml_l": l_best_params, "ml_m": m_best_params} + tune_res = {"l_tune": l_tune_res, "m_tune": m_tune_res} + + res = {"params": params, "tune_res": tune_res} + + return res diff --git a/doubleml/plm/tests/test_datasets.py b/doubleml/plm/tests/test_datasets.py index 5e16b9ac..0762ab47 100644 --- a/doubleml/plm/tests/test_datasets.py +++ b/doubleml/plm/tests/test_datasets.py @@ -9,12 +9,20 @@ make_lplr_LZZ2020, make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, + make_plpr_CP2025, make_plr_CCDDHNR2018, make_plr_turrell2018, ) msg_inv_return_type = "Invalid return_type." +msg_inv_dgp_type = "Invalid dgp type." + + +@pytest.fixture(scope="module", params=["dgp1", "dgp2", "dgp3"]) +def dgp_type(request): + return request.param + @pytest.mark.ci def test_make_plr_CCDDHNR2018_return_types(): @@ -149,3 +157,16 @@ def test_make_lplr_LZZ2020_variants(): res = make_lplr_LZZ2020(n_obs=100, balanced_r0=False) _, y_unique = np.unique(res.y, return_counts=True) assert np.abs(y_unique[0] - y_unique[1]) > 10 + + +@pytest.mark.ci +def test_make_plpr_CP2025_return_types(dgp_type): + np.random.seed(3141) + res = make_plpr_CP2025(num_id=100, dgp_type=dgp_type) + assert isinstance(res, pd.DataFrame) + + +@pytest.mark.ci +def test_make_plpr_CP2025_invalid_dgp_type(): + with pytest.raises(ValueError, match=msg_inv_dgp_type): + _ = make_plpr_CP2025(num_id=100, dgp_type="dgp4") diff --git a/doubleml/plm/tests/test_model_defaults.py b/doubleml/plm/tests/test_model_defaults.py index b555f5ad..bc9e8a50 100644 --- a/doubleml/plm/tests/test_model_defaults.py +++ b/doubleml/plm/tests/test_model_defaults.py @@ -1,14 +1,28 @@ +import numpy as np import pytest from sklearn.linear_model import LinearRegression, LogisticRegression -from doubleml import DoubleMLLPLR -from doubleml.plm.datasets import make_lplr_LZZ2020 +from doubleml import DoubleMLLPLR, DoubleMLPanelData, DoubleMLPLPR +from doubleml.double_ml import DoubleML +from doubleml.plm.datasets import make_lplr_LZZ2020, make_plpr_CP2025 from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap dml_data_lplr = make_lplr_LZZ2020(n_obs=100) dml_lplr_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression()) +plpr_data = make_plpr_CP2025(num_id=100, dgp_type="dgp1") +dml_data_plpr = DoubleMLPanelData( + plpr_data, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + static_panel=True, +) + +dml_plpr_obj = DoubleMLPLPR(dml_data_plpr, LinearRegression(), LinearRegression()) + @pytest.mark.ci def test_lplr_defaults(): @@ -17,3 +31,37 @@ def test_lplr_defaults(): _fit_bootstrap(dml_lplr_obj) _check_basic_defaults_after_fit(dml_lplr_obj) + + +@pytest.mark.ci +def test_plpr_defaults(): + _check_basic_defaults_before_fit(dml_plpr_obj) + + # manual fit and default check after fit + dml_plpr_obj.fit() + assert dml_plpr_obj.n_folds == 5 + assert dml_plpr_obj.n_rep == 1 + assert dml_plpr_obj.framework is not None + + # coefs and se + assert isinstance(dml_plpr_obj.coef, np.ndarray) + assert isinstance(dml_plpr_obj.se, np.ndarray) + assert isinstance(dml_plpr_obj.all_coef, np.ndarray) + assert isinstance(dml_plpr_obj.all_se, np.ndarray) + assert isinstance(dml_plpr_obj.t_stat, np.ndarray) + assert isinstance(dml_plpr_obj.pval, np.ndarray) + + # bootstrap and p_adjust method skipped + + # sensitivity + assert dml_plpr_obj.sensitivity_params is None + if dml_plpr_obj.sensitivity_params is not None: + assert isinstance(dml_plpr_obj.sensitivity_elements, dict) + + # fit method + if isinstance(dml_plpr_obj, DoubleML): + assert dml_plpr_obj.predictions is not None + assert dml_plpr_obj.models is None + + # confint method + assert dml_plpr_obj.confint().equals(dml_plpr_obj.confint(joint=False, level=0.95)) diff --git a/doubleml/plm/tests/test_plpr.py b/doubleml/plm/tests/test_plpr.py new file mode 100644 index 00000000..d072ebdf --- /dev/null +++ b/doubleml/plm/tests/test_plpr.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import Lasso, LinearRegression + +import doubleml as dml + +from ..datasets import make_plpr_CP2025 + + +@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)]) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["IV-type", "partialling out"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=["cre_general", "cre_normal", "fd_exact", "wg_approx"]) +def approach(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_plpr_fixture( + learner, + score, + approach, +): + n_folds = 5 + theta = 0.5 + + ml_l = clone(learner) + ml_m = clone(learner) + ml_g = clone(learner) + + np.random.seed(3141) + plpr_data = make_plpr_CP2025(theta=theta) + obj_dml_data = dml.DoubleMLPanelData( + plpr_data, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + static_panel=True, + ) + dml_plpr_obj = dml.DoubleMLPLPR( + obj_dml_data, + ml_l, + ml_m, + ml_g, + n_folds=n_folds, + score=score, + approach=approach, + ) + + dml_plpr_obj.fit() + + res_dict = { + "coef": dml_plpr_obj.coef[0], + "se": dml_plpr_obj.se[0], + "true_coef": theta, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_plpr_coef(dml_plpr_fixture): + # true_coef should lie within three standard deviations of the estimate + coef = dml_plpr_fixture["coef"] + se = dml_plpr_fixture["se"] + true_coef = dml_plpr_fixture["true_coef"] + assert abs(coef - true_coef) <= 3.0 * se diff --git a/doubleml/plm/tests/test_plpr_exceptions.py b/doubleml/plm/tests/test_plpr_exceptions.py new file mode 100644 index 00000000..f11811d2 --- /dev/null +++ b/doubleml/plm/tests/test_plpr_exceptions.py @@ -0,0 +1,276 @@ +import copy + +import numpy as np +import pandas as pd +import pytest +from sklearn.base import BaseEstimator +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml import DoubleMLPanelData, DoubleMLPLPR +from doubleml.plm.datasets import make_plpr_CP2025 + +np.random.seed(3141) +num_id = 100 +# create test data and basic learners +plpr_data = make_plpr_CP2025(num_id=num_id, theta=0.5, dim_x=30) +plpr_data_binary = plpr_data.copy() +plpr_data_binary["d"] = np.where(plpr_data_binary["d"] > 0, 1, 0) + +x_cols = [col for col in plpr_data.columns if "x" in col] +dml_data = DoubleMLPanelData( + plpr_data, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + static_panel=True, +) +dml_data_iv = DoubleMLPanelData( + plpr_data, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + x_cols=x_cols[:-1], + z_cols=x_cols[-1], + static_panel=True, +) +dml_data_binary = DoubleMLPanelData( + plpr_data_binary, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + static_panel=True, +) +ml_l = Lasso(alpha=0.1) +ml_m = Lasso(alpha=0.1) +ml_g = Lasso(alpha=0.1) +dml_plpr = DoubleMLPLPR(dml_data, ml_l, ml_m) +dml_plpr_iv_type = DoubleMLPLPR(dml_data, ml_l, ml_m, ml_g, score="IV-type") + + +@pytest.mark.ci +def test_plpr_exception_data(): + msg = "The data must be of DoubleMLPanelData type. was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(pd.DataFrame(), ml_l, ml_m) + + # instrument + msg = ( + r"Incompatible data. x30 have been set as instrumental variable\(s\). " + "DoubleMLPLPR currently does not support instrumental variables." + ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPLPR(dml_data_iv, ml_l, ml_m) + + +@pytest.mark.ci +def test_plpr_exception_scores(): + msg = "Invalid score IV. Valid score IV-type or partialling out." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, score="IV") + msg = "score should be either a string or a callable. 0 was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, score=0) + + +@pytest.mark.ci +def test_plpr_exception_approach(): + # PLPR valid approaches are 'cre_general', 'cre_normal', 'fd_exact', and 'wg_approx' + msg = "Invalid approach cre. Valid approach cre_general or cre_normal or fd_exact or wg_approx." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, approach="cre") + msg = "approach should be a string. 4 was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, approach=4) + + +@pytest.mark.ci +def test_plpr_exception_resampling(): + msg = "The number of folds must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, n_folds=1.5) + msg = "The number of repetitions for the sample splitting must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, n_rep=1.5) + msg = "The number of folds must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, n_folds=0) + msg = "The number of repetitions for the sample splitting must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, n_rep=0) + msg = "draw_sample_splitting must be True or False. Got true." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l, ml_m, draw_sample_splitting="true") + + +@pytest.mark.ci +def test_plpr_exception_get_params(): + msg = "Invalid nuisance learner ml_r. Valid nuisance learner ml_l or ml_m." + with pytest.raises(ValueError, match=msg): + dml_plpr.get_params("ml_r") + msg = "Invalid nuisance learner ml_g. Valid nuisance learner ml_l or ml_m." + with pytest.raises(ValueError, match=msg): + dml_plpr.get_params("ml_g") + msg = "Invalid nuisance learner ml_r. Valid nuisance learner ml_l or ml_m or ml_g." + with pytest.raises(ValueError, match=msg): + dml_plpr_iv_type.get_params("ml_r") + + +# TODO: test_doubleml_exception_onefold(): for plpr? +@pytest.mark.ci +def test_plpr_exception_smpls(): + msg = ( + "Sample splitting not specified. " + r"Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\)." + ) + dml_plpr_no_smpls = DoubleMLPLPR(dml_data, ml_l, ml_m, draw_sample_splitting=False) + with pytest.raises(ValueError, match=msg): + _ = dml_plpr_no_smpls.smpls + + dml_plpr_cluster = dml_plpr + smpls = dml_plpr.smpls + msg = "For cluster data, all_smpls_cluster must be provided." + with pytest.raises(ValueError, match=msg): + _ = dml_plpr_cluster.set_sample_splitting(smpls) + + all_smpls_cluster = copy.deepcopy(dml_plpr_cluster.smpls_cluster) + all_smpls_cluster.append(all_smpls_cluster[0]) + msg = "Invalid samples provided. Number of repetitions for all_smpls and all_smpls_cluster must be the same." + with pytest.raises(ValueError, match=msg): + _ = dml_plpr_cluster.set_sample_splitting(all_smpls=dml_plpr_cluster.smpls, all_smpls_cluster=all_smpls_cluster) + + all_smpls_cluster = copy.deepcopy(dml_plpr_cluster.smpls_cluster) + all_smpls_cluster.append(all_smpls_cluster[0]) + msg = "Invalid samples provided. Number of repetitions for all_smpls and all_smpls_cluster must be the same." + with pytest.raises(ValueError, match=msg): + _ = dml_plpr_cluster.set_sample_splitting(all_smpls=dml_plpr_cluster.smpls, all_smpls_cluster=all_smpls_cluster) + + all_smpls_cluster = copy.deepcopy(dml_plpr_cluster.smpls_cluster) + all_smpls_cluster[0][0][1][0] = np.append(all_smpls_cluster[0][0][1][0], [11], axis=0) + msg = "Invalid cluster partition provided. At least one inner list does not form a partition." + with pytest.raises(ValueError, match=msg): + _ = dml_plpr_cluster.set_sample_splitting(all_smpls=dml_plpr_cluster.smpls, all_smpls_cluster=all_smpls_cluster) + + all_smpls_cluster = copy.deepcopy(dml_plpr_cluster.smpls_cluster) + all_smpls_cluster[0][0][1][0][1] = 11 + msg = "Invalid cluster partition provided. At least one inner list does not form a partition." + with pytest.raises(ValueError, match=msg): + _ = dml_plpr_cluster.set_sample_splitting(all_smpls=dml_plpr_cluster.smpls, all_smpls_cluster=all_smpls_cluster) + + +@pytest.mark.ci +def test_plpr_exception_fit(): + msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." + with pytest.raises(TypeError, match=msg): + dml_plpr.fit(n_jobs_cv="5") + msg = "store_predictions must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_plpr.fit(store_predictions=1) + msg = "store_models must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_plpr.fit(store_models=1) + + +@pytest.mark.ci +def test_plpr_exception_set_ml_nuisance_params(): + msg = "Invalid nuisance learner g. Valid nuisance learner ml_l or ml_m." + with pytest.raises(ValueError, match=msg): + dml_plpr.set_ml_nuisance_params("g", "d", {"alpha": 0.1}) + msg = "Invalid treatment variable y. Valid treatment variable d_diff." + with pytest.raises(ValueError, match=msg): + dml_plpr.set_ml_nuisance_params("ml_l", "y", {"alpha": 0.1}) + + +class _DummyNoSetParams: + def fit(self): + pass + + +class _DummyNoGetParams(_DummyNoSetParams): + def set_params(self): + pass + + +class _DummyNoClassifier(_DummyNoGetParams, BaseEstimator): + def get_params(self, deep=True): + return {} + + def predict_proba(self): + pass + + +class LogisticRegressionManipulatedPredict(LogisticRegression): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.estimator_type = None + return tags + + def predict(self, X): + if self.max_iter == 314: + preds = super().predict_proba(X)[:, 1] + else: + preds = super().predict(X) + return preds + + +@pytest.mark.ci +def test_plpr_exception_learner(): + err_msg_prefix = "Invalid learner provided for ml_l: " + + msg = err_msg_prefix + "provide an instance of a learner instead of a class." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, Lasso, ml_m) + msg = err_msg_prefix + r"BaseEstimator\(\) has no method .fit\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLPLPR(dml_data, BaseEstimator(), ml_m) + # msg = err_msg_prefix + r'_DummyNoSetParams\(\) has no method .set_params\(\).' + with pytest.raises(TypeError): + _ = DoubleMLPLPR(dml_data, _DummyNoSetParams(), ml_m) + # msg = err_msg_prefix + r'_DummyNoSetParams\(\) has no method .get_params\(\).' + with pytest.raises(TypeError): + _ = DoubleMLPLPR(dml_data, _DummyNoGetParams(), ml_m) + + # we allow classifiers for ml_m in PLPR, but only for binary treatment variables + msg = ( + r"The ml_m learner LogisticRegression\(\) was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPLPR(dml_data, Lasso(), LogisticRegression()) + + msg = r"For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone\(ml_l\)." + with pytest.warns(UserWarning, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l=Lasso(), ml_m=ml_m, score="IV-type") + + msg = 'A learner ml_g has been provided for score = "partialling out" but will be ignored.' + with pytest.warns(UserWarning, match=msg): + _ = DoubleMLPLPR(dml_data, ml_l=Lasso(), ml_m=Lasso(), ml_g=Lasso(), score="partialling out") + + # construct a classifier which is not identifiable as classifier via is_classifier by sklearn + # it then predicts labels and therefore an exception will be thrown + # TODO: cases with approaches cre_general, fd_exact, wg_approx + log_reg = LogisticRegressionManipulatedPredict() + msg_warn = ( + r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedPredict\(\) is \(probably\) " + "neither a regressor nor a classifier. Method predict is used for prediction." + ) + with pytest.warns(UserWarning, match=msg_warn): + dml_plpr_hidden_classifier = DoubleMLPLPR(dml_data_binary, Lasso(), log_reg, approach="cre_normal") + msg = ( + r"For the binary variable d, predictions obtained with the ml_m learner LogisticRegressionManipulatedPredict\(\) " + "are also observed to be binary with values 0 and 1. Make sure that for classifiers probabilities and not " + "labels are predicted." + ) + with pytest.warns(UserWarning, match=msg_warn): + with pytest.raises(ValueError, match=msg): + dml_plpr_hidden_classifier.fit() + + +@pytest.mark.ci +@pytest.mark.filterwarnings("ignore:Learner provided for") +def test_plpr_exception_and_warning_learner(): + # msg = err_msg_prefix + r'_DummyNoClassifier\(\) has no method .predict\(\).' + with pytest.raises(TypeError): + _ = DoubleMLPLPR(dml_data, _DummyNoClassifier(), Lasso()) diff --git a/doubleml/plm/tests/test_plpr_external_predictions.py b/doubleml/plm/tests/test_plpr_external_predictions.py new file mode 100644 index 00000000..cd43cec1 --- /dev/null +++ b/doubleml/plm/tests/test_plpr_external_predictions.py @@ -0,0 +1,106 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression + +from doubleml import DoubleMLPanelData, DoubleMLPLPR +from doubleml.plm.datasets import make_plpr_CP2025 +from doubleml.utils import DMLDummyRegressor + +treat_label = {"cre_general": "d", "cre_normal": "d", "fd_exact": "d_diff", "wg_approx": "d_demean"} + + +@pytest.fixture(scope="module", params=["IV-type", "partialling out"]) +def plpr_score(request): + return request.param + + +@pytest.fixture(scope="module", params=["cre_general", "cre_normal", "fd_exact", "wg_approx"]) +def plpr_approach(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_l_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_plpr_fixture(plpr_score, plpr_approach, n_rep, set_ml_m_ext, set_ml_l_ext, set_ml_g_ext): + ext_predictions = {treat_label[plpr_approach]: {}} + + plpr_data = make_plpr_CP2025(num_id=100, theta=0.5, dgp_type="dgp1") + + np.random.seed(3141) + dml_data_plpr = DoubleMLPanelData( + plpr_data, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + static_panel=True, + ) + + kwargs = {"obj_dml_data": dml_data_plpr, "score": plpr_score, "approach": plpr_approach, "n_rep": n_rep} + + if plpr_score == "IV-type": + kwargs["ml_g"] = LinearRegression() + + dml_plpr = DoubleMLPLPR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs) + + np.random.seed(3141) + dml_plpr.fit(store_predictions=True) + + if set_ml_m_ext: + ext_predictions[treat_label[plpr_approach]]["ml_m"] = dml_plpr.predictions["ml_m"][:, :, 0] + ml_m = DMLDummyRegressor() + else: + ml_m = LinearRegression() + + if set_ml_l_ext: + ext_predictions[treat_label[plpr_approach]]["ml_l"] = dml_plpr.predictions["ml_l"][:, :, 0] + ml_l = DMLDummyRegressor() + else: + ml_l = LinearRegression() + + if plpr_score == "IV-type" and set_ml_g_ext: + ext_predictions[treat_label[plpr_approach]]["ml_g"] = dml_plpr.predictions["ml_g"][:, :, 0] + kwargs["ml_g"] = DMLDummyRegressor() + elif plpr_score == "IV-type" and not set_ml_g_ext: + kwargs["ml_g"] = LinearRegression() + else: + pass + + if plpr_score == "IV-type" and set_ml_g_ext and not set_ml_l_ext: + ml_l = DMLDummyRegressor() + + # special case if ml_l is not needed + dml_plpr_ext = DoubleMLPLPR(ml_m=ml_m, ml_l=ml_l, **kwargs) + + np.random.seed(3141) + dml_plpr_ext.fit(external_predictions=ext_predictions) + + res_dict = {"coef_normal": dml_plpr.coef[0], "coef_ext": dml_plpr_ext.coef[0]} + + return res_dict + + +@pytest.mark.ci +def test_doubleml_plpr_coef(doubleml_plpr_fixture): + assert math.isclose(doubleml_plpr_fixture["coef_normal"], doubleml_plpr_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-4) diff --git a/doubleml/plm/tests/test_plpr_tune.py b/doubleml/plm/tests/test_plpr_tune.py new file mode 100644 index 00000000..e1dcef2d --- /dev/null +++ b/doubleml/plm/tests/test_plpr_tune.py @@ -0,0 +1,104 @@ +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import Lasso + +import doubleml as dml + +from ..datasets import make_plpr_CP2025 + + +@pytest.fixture(scope="module", params=[Lasso()]) +def learner_l(request): + return request.param + + +@pytest.fixture(scope="module", params=[Lasso()]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope="module", params=[Lasso()]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=["cre_general", "cre_normal", "fd_exact", "wg_approx"]) +def approach(request): + return request.param + + +def get_par_grid(): + par_grid = {"alpha": np.linspace(0.05, 0.95, 7)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_plpr_fixture( + learner_l, + learner_m, + learner_g, + score, + approach, + tune_on_folds=False, +): + par_grid = { + "ml_l": get_par_grid(), + "ml_m": get_par_grid(), + "ml_g": get_par_grid(), + } + n_folds_tune = 4 + n_folds = 5 + theta = 0.5 + + ml_l = clone(learner_l) + ml_m = clone(learner_m) + ml_g = clone(learner_g) + + np.random.seed(3141) + plpr_data = make_plpr_CP2025(theta=theta) + obj_dml_data = dml.DoubleMLPanelData( + plpr_data, + y_col="y", + d_cols="d", + t_col="time", + id_col="id", + static_panel=True, + ) + dml_sel_obj = dml.DoubleMLPLPR( + obj_dml_data, + ml_l, + ml_m, + ml_g, + n_folds=n_folds, + score=score, + approach=approach, + ) + + # tune hyperparameters + tune_res = dml_sel_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLPLPR) + + dml_sel_obj.fit() + + res_dict = { + "coef": dml_sel_obj.coef[0], + "se": dml_sel_obj.se[0], + "true_coef": theta, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_plpr_coef(dml_plpr_fixture): + # true_coef should lie within three standard deviations of the estimate + coef = dml_plpr_fixture["coef"] + se = dml_plpr_fixture["se"] + true_coef = dml_plpr_fixture["true_coef"] + assert abs(coef - true_coef) <= 3.0 * se diff --git a/doubleml/plm/tests/test_return_types.py b/doubleml/plm/tests/test_return_types.py index cb32f543..62075fc3 100644 --- a/doubleml/plm/tests/test_return_types.py +++ b/doubleml/plm/tests/test_return_types.py @@ -41,6 +41,7 @@ (dml_lplr_obj, DoubleMLLPLR), (dml_lplr_obj_binary, DoubleMLLPLR), ] +# TODO: plpr with cluster data return type tests? n_obs is changed for fd_exact approach @pytest.mark.ci