Draft
Changes from all commits
36 commits
fe6375e
static panel model, data, and util
JulianDiefenbacher Jun 23, 2025
2031923
update plpr and dataset
JulianDiefenbacher Jun 26, 2025
5f6a374
add basic simulations
JulianDiefenbacher Jun 26, 2025
215a5e5
add plpr __str__, and checks
JulianDiefenbacher Jul 10, 2025
4189671
update example_sim
JulianDiefenbacher Jul 10, 2025
2c7e238
add model descriptions
JulianDiefenbacher Oct 9, 2025
7708f9b
fix typo
JulianDiefenbacher Oct 9, 2025
d5bf9e9
fix notation consistency
JulianDiefenbacher Oct 9, 2025
d8c3039
update description numbering
JulianDiefenbacher Oct 9, 2025
375325f
Merge branch 'main' into j-static-panel
JulianDiefenbacher Oct 27, 2025
753f68a
update from ClusterData to base Data class
JulianDiefenbacher Oct 27, 2025
058da4e
add static_panel flag in PanelData
JulianDiefenbacher Oct 27, 2025
85da2eb
add static_panel property
JulianDiefenbacher Nov 4, 2025
08cfd8c
add static_panel property and update tests for panel data handling
SvenKlaassen Nov 4, 2025
1ecab66
update plpr model, include data transformation
JulianDiefenbacher Nov 6, 2025
353e148
Merge branch 'main' into j-static-panel
JulianDiefenbacher Nov 6, 2025
5dc1a44
refactor: simplify string representation and add additional info meth…
SvenKlaassen Nov 7, 2025
fdc4330
correct score info string newline spacing
JulianDiefenbacher Nov 10, 2025
bccc81c
data transform update, add transform_col_names property
JulianDiefenbacher Nov 10, 2025
eb68557
add transformed data arrays for nuisance estimation
JulianDiefenbacher Nov 10, 2025
c404f06
add _initialize_fd_model because of n_obs and smpls issue
JulianDiefenbacher Nov 11, 2025
3ccfb34
clearer TODO description
JulianDiefenbacher Nov 11, 2025
ab9f83f
move data transformation before init
JulianDiefenbacher Nov 13, 2025
f27ebeb
update logic for cre_normal approach in estimation and tuning
JulianDiefenbacher Nov 13, 2025
011f8cb
add simulation replication
JulianDiefenbacher Nov 24, 2025
3f8eef6
update PLPR model
JulianDiefenbacher Nov 24, 2025
677c5a6
allow binary treatment for PLPR
JulianDiefenbacher Nov 24, 2025
7c7bd43
update plpr dgp function
JulianDiefenbacher Nov 27, 2025
ecce2b5
update make_plpr use
JulianDiefenbacher Nov 27, 2025
122e97d
Merge branch 'main' into j-static-panel
JulianDiefenbacher Nov 27, 2025
453cbf6
add basic plpr tests
JulianDiefenbacher Nov 30, 2025
86c947a
remove notebooks
JulianDiefenbacher Dec 1, 2025
7d01cd1
fix id var issue
JulianDiefenbacher Dec 1, 2025
7c24a99
complete plpr dataset tests
JulianDiefenbacher Dec 2, 2025
fb56a27
fix formatting
JulianDiefenbacher Dec 2, 2025
b15571c
add external pred test, complete model default test
JulianDiefenbacher Dec 3, 2025
2 changes: 2 additions & 0 deletions doubleml/__init__.py
@@ -15,6 +15,7 @@
from .irm.ssm import DoubleMLSSM
from .plm.lplr import DoubleMLLPLR
from .plm.pliv import DoubleMLPLIV
from .plm.plpr import DoubleMLPLPR
from .plm.plr import DoubleMLPLR
from .utils.blp import DoubleMLBLP
from .utils.policytree import DoubleMLPolicyTree
@@ -44,6 +45,7 @@
"DoubleMLPolicyTree",
"DoubleMLSSM",
"DoubleMLLPLR",
"DoubleMLPLPR",
]

__version__ = importlib.metadata.version("doubleml")
36 changes: 29 additions & 7 deletions doubleml/data/panel_data.py
@@ -41,6 +41,13 @@ class DoubleMLPanelData(DoubleMLData):
The instrumental variable(s).
Default is ``None``.

static_panel : bool
Indicates whether the data model corresponds to a static
panel data approach (``True``) or to staggered adoption panel data
(``False``). In the latter case, the treatment groups/values are defined in terms of the first time of
treatment exposure.
Default is ``False``.

use_other_treat_as_covariate : bool
Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
Default is ``True``.
@@ -82,31 +89,40 @@ def __init__(
id_col,
x_cols=None,
z_cols=None,
static_panel=False,
use_other_treat_as_covariate=True,
force_all_x_finite=True,
datetime_unit="M",
):
DoubleMLBaseData.__init__(self, data)

self._static_panel = static_panel

# we need to set id_col (needs _data) before calling the super __init__ because of the x_cols setter
self.id_col = id_col
self._datetime_unit = _is_valid_datetime_unit(datetime_unit)
self._set_id_var()

# Set time column before calling parent constructor
self.t_col = t_col
self._datetime_unit = _is_valid_datetime_unit(datetime_unit)

if not self.static_panel:
cluster_cols = None
force_all_d_finite = False
else:
cluster_cols = id_col
force_all_d_finite = True

# Call parent constructor
DoubleMLData.__init__(
self,
data=data,
y_col=y_col,
d_cols=d_cols,
x_cols=x_cols,
z_cols=z_cols,
cluster_cols=cluster_cols,
use_other_treat_as_covariate=use_other_treat_as_covariate,
force_all_x_finite=force_all_x_finite,
force_all_d_finite=False,
force_all_d_finite=force_all_d_finite,
)

# reset index to ensure a simple RangeIndex
@@ -115,15 +131,15 @@ def __init__(
# Set time variable array after data is loaded
self._set_time_var()

if self.n_treat != 1:
raise ValueError("Only one treatment column is allowed for panel data.")

self._check_disjoint_sets_id_col()

# intialize the unique values of g and t
self._g_values = np.sort(np.unique(self.d)) # unique values of g
self._t_values = np.sort(np.unique(self.t)) # unique values of t

if self.n_treat != 1:
raise ValueError("Only one treatment column is allowed for panel data.")

def __str__(self):
data_summary = self._data_summary_str()
buf = io.StringIO()
@@ -146,6 +162,7 @@ def _data_summary_str(self):
f"Instrument variable(s): {self.z_cols}\n"
f"Time variable: {self.t_col}\n"
f"Id variable: {self.id_col}\n"
f"Static panel data: {self.static_panel}\n"
)

data_summary += f"No. Unique Ids: {self.n_ids}\n" f"No. Observations: {self.n_obs}\n"
@@ -296,6 +313,11 @@ def n_t_periods(self):
"""
return len(self.t_values)

@property
def static_panel(self):
"""Indicates whether the data model corresponds to a static panel data approach."""
return self._static_panel

def _get_optional_col_sets(self):
base_optional_col_sets = super()._get_optional_col_sets()
id_col_set = {self.id_col}
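The new ``static_panel`` flag can be exercised with a small hand-rolled panel. A minimal sketch (toy data and column names chosen for illustration, not taken from this diff; the import path assumes the public ``doubleml.data`` namespace):

import numpy as np
import pandas as pd
from doubleml.data import DoubleMLPanelData

# toy balanced panel: 5 units observed over 4 periods (hypothetical data)
n_ids, n_periods = 5, 4
df = pd.DataFrame(
    {
        "id": np.repeat(np.arange(n_ids), n_periods),
        "t": np.tile(np.arange(n_periods), n_ids),
        "y": np.random.standard_normal(n_ids * n_periods),
        "d": np.random.standard_normal(n_ids * n_periods),
        "x1": np.random.standard_normal(n_ids * n_periods),
    }
)

dml_data = DoubleMLPanelData(
    data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["x1"], static_panel=True
)
print(dml_data.static_panel)  # True
print(dml_data.cluster_cols)  # ['id'] -- in the static panel case the id column doubles as the cluster column

In the non-static (staggered adoption) case, ``static_panel=False`` leaves ``cluster_cols`` unset, matching the behavior asserted in the tests below.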
23 changes: 21 additions & 2 deletions doubleml/data/tests/test_panel_data.py
@@ -157,14 +157,26 @@ def test_panel_data_str():
assert "Time variable: t" in dml_str
assert "Id variable: id" in dml_str
assert "No. Observations:" in dml_str
assert "Static panel data:" in dml_str


@pytest.fixture(scope="module", params=[True, False])
def static_panel(request):
return request.param


@pytest.mark.ci
def test_panel_data_properties():
def test_panel_data_properties(static_panel):
np.random.seed(3141)
df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
dml_data = DoubleMLPanelData(
data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
data=df,
y_col="y",
d_cols="d",
t_col="t",
id_col="id",
x_cols=[f"Z{i + 1}" for i in np.arange(4)],
static_panel=static_panel,
)

assert np.array_equal(dml_data.id_var, df["id"].values)
@@ -176,3 +188,10 @@ def test_panel_data_properties():
assert dml_data.n_groups == len(np.unique(df["d"].values))
assert np.array_equal(dml_data.t_values, np.sort(np.unique(df["t"].values)))
assert dml_data.n_t_periods == len(np.unique(df["t"].values))

if static_panel:
assert dml_data.static_panel is True
assert dml_data.cluster_cols == ["id"]
else:
assert dml_data.static_panel is False
assert dml_data.cluster_cols is None
3 changes: 2 additions & 1 deletion doubleml/plm/__init__.py
@@ -4,6 +4,7 @@

from .lplr import DoubleMLLPLR
from .pliv import DoubleMLPLIV
from .plpr import DoubleMLPLPR
from .plr import DoubleMLPLR

__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"]
__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR", "DoubleMLPLPR"]
2 changes: 2 additions & 0 deletions doubleml/plm/datasets/__init__.py
@@ -7,6 +7,7 @@
from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020
from .dgp_pliv_CHS2015 import make_pliv_CHS2015
from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021
from .dgp_plpr_CP2025 import make_plpr_CP2025
from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018
from .dgp_plr_turrell2018 import make_plr_turrell2018

@@ -18,4 +19,5 @@
"make_pliv_multiway_cluster_CKMS2021",
"make_lplr_LZZ2020",
"_make_pliv_data",
"make_plpr_CP2025",
]
134 changes: 134 additions & 0 deletions doubleml/plm/datasets/dgp_plpr_CP2025.py
@@ -0,0 +1,134 @@
import numpy as np
import pandas as pd


def make_plpr_CP2025(num_id=250, num_t=10, dim_x=30, theta=0.5, dgp_type="dgp1"):
"""
Generates synthetic data for a partially linear panel regression model, based on Clarke and Polselli (2025).
The data generating process is defined as

.. math::

Y_{it} &= D_{it} \\theta + l_0(X_{it}) + \\alpha_i + U_{it}, & &U_{it} \\sim \\mathcal{N}(0,1),

D_{it} &= m_0(X_{it}) + c_i + V_{it}, & &V_{it} \\sim \\mathcal{N}(0,1),

\\alpha_i &= 0.25 \\left(\\frac{1}{T} \\sum_{t=1}^{T} D_{it} - \\bar{D} \\right)
+ 0.25 \\frac{1}{T} \\sum_{t=1}^{T} \\sum_{k \\in \\mathcal{K}} X_{it,k} + a_i


    with :math:`a_i \\sim \\mathcal{N}(0,0.95)`, :math:`X_{it,p} \\sim \\mathcal{N}(0,5)` and :math:`c_i \\sim \\mathcal{N}(0,1)`,
    where :math:`\\mathcal{K} = \\{1,3\\}` is the set of relevant (non-zero) confounding variables and :math:`p` indexes the
    full set of confounding variables.

    Clarke and Polselli (2025) consider three functional forms for the nuisance functions :math:`l_0`
    and :math:`m_0` in the confounders, with varying levels of non-linearity and non-smoothness:

Design 1. (dgp1): Linear nuisance functions

.. math::

l_0(X_{it}) &= a X_{it,1} + X_{it,3}

m_0(X_{it}) &= a X_{it,1} + X_{it,3}

Design 2. (dgp2): Non-linear and smooth nuisance functions

.. math::

l_0(X_{it}) &= \\frac{\\exp(X_{it,1})}{1 + \\exp(X_{it,1})} + a \\cos(X_{it,3})

m_0(X_{it}) &= \\cos(X_{it,1}) + a \\frac{\\exp(X_{it,3})}{1 + \\exp(X_{it,3})}

Design 3. (dgp3): Non-linear and discontinuous nuisance functions

.. math::

l_0(X_{it}) &= b (X_{it,1} \\cdot X_{it,3}) + a (X_{it,3} \\cdot 1\\{X_{it,3} > 0\\})

m_0(X_{it}) &= a (X_{it,1} \\cdot 1\\{X_{it,1} > 0\\}) + b (X_{it,1} \\cdot X_{it,3}),

where :math:`a = 0.25`, :math:`b = 0.5`.

Parameters
----------
num_id : int
The number of units in the panel. Default is ``250``.
num_t : int
The number of time periods per unit. Default is ``10``.
dim_x : int
The number of confounding variables. Default is ``30``.
theta : float
The value of the causal parameter. Default is ``0.5``.
dgp_type : str
The type of DGP design to be used. Default is ``'dgp1'``; other options are ``'dgp2'`` and ``'dgp3'``.

Returns
-------
pandas.DataFrame
DataFrame containing the simulated static panel data.

References
----------
Clarke, P. S. and Polselli, A. (2025),
Double machine learning for static panel models with fixed effects. The Econometrics Journal, utaf011,
doi:`10.1093/ectj/utaf011 <https://doi.org/10.1093/ectj/utaf011>`_.
"""

# parameters
a = 0.25
b = 0.5
sigma2_a = 0.95
sigma2_x = 5

# id and time vectors
id_var = np.repeat(np.arange(1, num_id + 1), num_t)
time = np.tile(np.arange(1, num_t + 1), num_id)

# individual fixed effects
a_i = np.repeat(np.random.normal(0, np.sqrt(sigma2_a), num_id), num_t)
c_i = np.repeat(np.random.standard_normal(num_id), num_t)

# covariates and errors
x_mean = 0
x_it = np.random.normal(loc=x_mean, scale=np.sqrt(sigma2_x), size=(num_id * num_t, dim_x))
u_it = np.random.standard_normal(num_id * num_t)
v_it = np.random.standard_normal(num_id * num_t)

# functional forms in nuisance functions
if dgp_type == "dgp1":
l_0 = a * x_it[:, 0] + x_it[:, 2]
m_0 = a * x_it[:, 0] + x_it[:, 2]
elif dgp_type == "dgp2":
l_0 = np.divide(np.exp(x_it[:, 0]), 1 + np.exp(x_it[:, 0])) + a * np.cos(x_it[:, 2])
m_0 = np.cos(x_it[:, 0]) + a * np.divide(np.exp(x_it[:, 2]), 1 + np.exp(x_it[:, 2]))
elif dgp_type == "dgp3":
l_0 = b * (x_it[:, 0] * x_it[:, 2]) + a * (x_it[:, 2] * np.where(x_it[:, 2] > 0, 1, 0))
m_0 = a * (x_it[:, 0] * np.where(x_it[:, 0] > 0, 1, 0)) + b * (x_it[:, 0] * x_it[:, 2])
else:
raise ValueError("Invalid dgp type.")

# treatment
d_it = m_0 + c_i + v_it

def alpha_i(x_it, d_it, a_i, num_n, num_t):
d_i = np.array_split(d_it, num_n)
d_i_term = np.repeat(np.mean(d_i, axis=1), num_t) - np.mean(d_it)

x_i = np.array_split(np.sum(x_it[:, [0, 2]], axis=1), num_n)
x_i_mean = np.mean(x_i, axis=1)
x_i_term = np.repeat(x_i_mean, num_t)

alpha_term = 0.25 * d_i_term + 0.25 * x_i_term + a_i
return alpha_term

# outcome
y_it = d_it * theta + l_0 + alpha_i(x_it, d_it, a_i, num_id, num_t) + u_it

x_cols = [f"x{i + 1}" for i in np.arange(dim_x)]

data = pd.DataFrame(np.column_stack((id_var, time, y_it, d_it, x_it)), columns=["id", "time", "y", "d"] + x_cols).astype(
{"id": "int64", "time": "int64"}
)

return data
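End to end, the generator can be paired with the static panel data backend added in this PR. A minimal sketch (the random forest learners, their hyperparameters, and the ``DoubleMLPLPR(dml_data, ml_l, ml_m)`` signature are assumptions made for illustration, mirroring the other PLM models; they are not taken from this diff):

import numpy as np
from doubleml import DoubleMLPLPR
from doubleml.data import DoubleMLPanelData
from doubleml.plm.datasets import make_plpr_CP2025
from sklearn.ensemble import RandomForestRegressor

np.random.seed(3141)
df = make_plpr_CP2025(num_id=100, num_t=5, dim_x=30, theta=0.5, dgp_type="dgp1")

# column names follow the generator output: id, time, y, d, x1..x30
dml_data = DoubleMLPanelData(
    data=df,
    y_col="y",
    d_cols="d",
    t_col="time",
    id_col="id",
    x_cols=[f"x{i + 1}" for i in range(30)],
    static_panel=True,
)

# assumed learner setup; the actual DoubleMLPLPR defaults and arguments may differ
ml_l = RandomForestRegressor(n_estimators=100, max_depth=5)
ml_m = RandomForestRegressor(n_estimators=100, max_depth=5)
dml_plpr = DoubleMLPLPR(dml_data, ml_l, ml_m)
dml_plpr.fit()
print(dml_plpr.summary)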