# Example Notebook: Regression Discontinuity Design with Linear and Flexible Covariate Adjustment

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objs as go
from statsmodels.nonparametric.kernel_regression import KernelReg

from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

from rdrobust import rdrobust

import doubleml as dml
from doubleml.rdd import RDFlex
from doubleml.rdd.datasets import make_simple_rdd_data

## Data

### Generate Data

In [2]:
# Fix the global NumPy seed before drawing the simulated RDD sample.
np.random.seed(42)
data_dict = make_simple_rdd_data(n_obs=500)

# One column name per covariate column found in data_dict['X'].
cov_names = [f'x{idx}' for idx in range(data_dict['X'].shape[1])]

# Stack outcome, treatment, score and covariates side by side into one frame.
df = pd.DataFrame(
    data=np.column_stack([data_dict[key] for key in ('Y', 'D', 'score', 'X')]),
    columns=['y', 'd', 'score', *cov_names],
)
df

Unnamed: 0,y,d,score,x0,x1,x2,x3,x4
0,-0.064332,1.0,-0.250920,0.414477,-0.694922,0.152577,0.213430,-0.151739
1,5.841572,1.0,0.901429,0.472888,0.868734,0.851137,-0.098321,-0.773524
2,-1.714202,0.0,0.463988,0.969682,0.677796,-0.750675,0.841684,0.739793
3,-0.708352,1.0,0.197317,0.037676,0.182551,-0.201995,-0.890477,-0.329606
4,-0.201265,1.0,-0.687963,0.605707,-0.990736,-0.333002,-0.203663,0.074791
...,...,...,...,...,...,...,...,...
495,1.608514,0.0,-0.293296,0.642777,0.686652,-0.029811,-0.332743,0.583116
496,3.994754,1.0,0.167312,-0.097412,-0.633116,0.709948,0.765120,-0.067380
497,0.690608,0.0,-0.844531,-0.848601,-0.224348,0.607075,0.803548,-0.593055
498,0.786864,1.0,0.948790,-0.866053,0.754701,-0.221190,0.083522,0.936132


In [3]:
# Outcome against score, coloured by the boolean-cast treatment indicator.
# The axis titles are set again explicitly through update_layout.
fig = px.scatter(
    x=df['score'],
    y=df['y'],
    color=df['d'].astype(bool),
    labels=dict(x="Score", y="Outcome", color="Treatment"),
    title="Scatter Plot of Outcome vs. Score by Treatment Status",
).update_layout(xaxis_title="Score", yaxis_title="Outcome")
fig.show()

### Oracle Values and Comparisons

In [4]:
# Individual treatment effects: difference of the two oracle potential outcomes.
ite = data_dict['oracle_values']['Y1'] - data_dict['oracle_values']['Y0']
score = data_dict['score']
score_grid = np.linspace(-1, 1, num=100)

# Local-linear ('ll') kernel regression of the ITE on the score with a
# Gaussian kernel, evaluated on an evenly spaced grid over [-1, 1].
oracle_model = KernelReg(
    endog=ite,
    exog=score,
    reg_type='ll',
    var_type='c',
    ckertype='gaussian',
)
oracle_effects, _ = oracle_model.fit(score_grid)

# Raw ITEs as markers, smoothed effect curve as a line.
scatter = go.Scatter(x=score, y=ite, mode='markers', name='ITE', marker={'color': 'blue'})
line = go.Scatter(
    x=score_grid,
    y=oracle_effects,
    mode='lines',
    name='Average Effect Estimate',
    line={'color': 'red'},
)

fig = go.Figure(data=[scatter, line]).update_layout(
    title='Locally Linear Kernel Regression of ITE on Score',
    xaxis_title='Score',
    yaxis_title='Effect',
    legend={'x': 0.8, 'y': 0.2},
)
fig.show()

## RDD with linear adjustment

In [5]:
# Baseline rdrobust fit at cutoff 0.0: the treatment column is passed as
# `fuzzy` and the covariate columns are passed through `covs`.
rdrobust(
    y=df['y'],
    x=df['score'],
    fuzzy=df['d'],
    covs=df[cov_names],
    c=0.0,
)

Call: rdrobust
Number of Observations:                   500
Polynomial Order Est. (p):                  1
Polynomial Order Bias (q):                  2
Kernel:                            Triangular
Bandwidth Selection:                    mserd
Var-Cov Estimator:                         NN

                                Left      Right
------------------------------------------------
Number of Observations           241        259
Number of Unique Obs.            241        259
Number of Effective Obs.          69         93
Bandwidth Estimation           0.341      0.341
Bandwidth Bias                  0.51       0.51
rho (h/b)                      0.669      0.669

Method             Coef.     S.E.   t-stat    P>|t|       95% CI      
-------------------------------------------------------------------------
Conventional       1.016    0.684    1.485   1.376e-01    [-0.325, 2.357]
Robust                 -        -    1.243   2.140e-01     [-0.571, 2.55]




## RDD with flexible adjustment

In [6]:
# Wrap the frame in a DoubleMLData container, naming the outcome, treatment,
# covariate and score columns.
dml_data = dml.DoubleMLData(
    df,
    y_col='y',
    d_cols='d',
    x_cols=cov_names,
    s_col='score',
)

In [9]:
# Learners handed to RDFlex. Exactly one learner pair is defined so the pair
# that reaches the model is unambiguous; assigning a second pair to the same
# names would silently discard the first.
# To try boosted-tree learners instead, swap in for example:
#   ml_g = LGBMRegressor(n_estimators=100, learning_rate=0.01, verbose=-1)
#   ml_m = LGBMClassifier(n_estimators=100, learning_rate=0.01, verbose=-1)
ml_g = LinearRegression()
ml_m = LogisticRegression()

# NOTE(review): the rdrobust baseline earlier in this notebook is called with
# fuzzy=df['d']; confirm whether RDFlex needs an explicit fuzzy flag so that
# both estimators target the same design.
rdflex_model = RDFlex(dml_data,
                      ml_g,
                      ml_m,
                      n_folds=5,
                      n_rep=1)
rdflex_model.fit(n_iterations=1)

# print() shows the model's plain-text summary table.
print(rdflex_model)

Method                  Coef.     S.E.    P>|t|            95% CI
-----------------------------------------------------------------
Conventional            0.497    1.562    0.751   -2.564 to 3.558   
Robust                      -        -    0.996   -3.483 to 3.499   


In [10]:
# Display the first element of the fitted model's `_rdd_obj` container.
# NOTE(review): `_rdd_obj` is a private attribute (leading underscore) and may
# change between doubleml versions; presumably it holds one rdrobust result per
# repetition -- confirm against the RDFlex implementation.
rdflex_model._rdd_obj[0]

Call: rdrobust
Number of Observations:                   228
Polynomial Order Est. (p):                  1
Polynomial Order Bias (q):                  2
Kernel:                            Triangular
Bandwidth Selection:                    mserd
Var-Cov Estimator:                         NN

                                Left      Right
------------------------------------------------
Number of Observations           106        122
Number of Unique Obs.            106        122
Number of Effective Obs.          30         43
Bandwidth Estimation           0.162      0.162
Bandwidth Bias                 0.251      0.251
rho (h/b)                      0.647      0.647

Method             Coef.     S.E.   t-stat    P>|t|       95% CI      
-------------------------------------------------------------------------
Conventional       0.497    1.562    0.318   7.505e-01    [-2.564, 3.558]
Robust                 -        -    0.005   9.964e-01    [-3.483, 3.499]


