# Example Notebook: Regression Discontinuity Designs with Flexible Covariate Adjustment (RDFlex)

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objs as go
from statsmodels.nonparametric.kernel_regression import KernelReg

from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

from rdrobust import rdrobust

import doubleml as dml
from doubleml.rdd import RDFlex
from doubleml.rdd.datasets import make_simple_rdd_data

## Data

### Generate Data

In [2]:
# Seed NumPy's global RNG before simulating (presumably what makes the draw repeatable).
np.random.seed(42)
data_dict = make_simple_rdd_data(n_obs=5000)

# One generic name per covariate column: x0, x1, ...
n_covariates = data_dict['X'].shape[1]
cov_names = [f'x{idx}' for idx in range(n_covariates)]

# Put outcome, treatment, score and the covariate matrix side by side in one frame.
stacked = np.column_stack(
    (data_dict['Y'], data_dict['D'], data_dict['score'], data_dict['X'])
)
df = pd.DataFrame(stacked, columns=['y', 'd', 'score', *cov_names])
df

Unnamed: 0,y,d,score,x0,x1,x2,x3
0,-1.882701,0.0,-0.250920,-0.252718,-0.334176,-0.647692,0.214533
1,1.928160,1.0,0.901429,-0.046752,0.731402,-0.935781,0.287736
2,4.583145,0.0,0.463988,0.525898,0.518973,0.772148,0.458067
3,3.054653,1.0,0.197317,0.855620,-0.334687,0.006419,-0.971841
4,-2.428923,0.0,-0.687963,-0.986085,-0.519747,-0.798386,-0.479577
...,...,...,...,...,...,...,...
4995,2.342546,1.0,0.794795,-0.553685,-0.224255,0.713344,0.477812
4996,-0.873502,0.0,-0.761238,-0.690818,0.215416,0.257281,-0.818863
4997,1.360486,1.0,-0.344314,-0.068389,-0.478663,-0.065740,0.554142
4998,1.814638,1.0,0.631491,-0.683481,-0.753392,-0.461367,0.904551


In [3]:
# Display names for the x / y / color arguments of the scatter call.
axis_labels = {"x": "Score", "y": "Outcome", "color": "Treatment"}
# Boolean view of the treatment column, used to split the points into two colour groups.
treated = df['d'].astype(bool)

fig = px.scatter(
    x=df['score'],
    y=df['y'],
    color=treated,
    labels=axis_labels,
    title="Scatter Plot of Outcome vs. Score by Treatment Status",
)

# Set the axis titles explicitly as well.
fig.update_layout(xaxis_title="Score", yaxis_title="Outcome")
fig.show()

### Oracle Values and Comparisons

In [4]:
# Individual treatment effect: difference of the two oracle potential outcomes.
oracle_values = data_dict['oracle_values']
ite = oracle_values['Y1'] - oracle_values['Y0']
score = data_dict['score']

# Local-linear ('ll') kernel regression of the ITE on the continuous ('c') score,
# using a Gaussian kernel.
oracle_model = KernelReg(
    endog=ite,
    exog=score,
    reg_type='ll',
    var_type='c',
    ckertype='gaussian',
)

# Evaluate the fit on 100 evenly spaced score values in [-1, 1];
# the second return value of fit() is not needed here.
score_grid = np.linspace(-1, 1, 100)
oracle_effects, _ = oracle_model.fit(score_grid)

# Raw ITEs as blue markers, smoothed effect curve as a red line.
scatter = go.Scatter(x=score, y=ite, mode='markers', name='ITE', marker={'color': 'blue'})
line = go.Scatter(
    x=score_grid,
    y=oracle_effects,
    mode='lines',
    name='Average Effect Estimate',
    line={'color': 'red'},
)

fig = go.Figure(data=[scatter, line])
fig.update_layout(
    title='Locally Linear Kernel Regression of ITE on Score',
    xaxis_title='Score',
    yaxis_title='Effect',
    legend={'x': 0.8, 'y': 0.2},
)

fig.show()

## RDD with linear adjustment

In [5]:
# Benchmark estimate: rdrobust with the treatment passed as `fuzzy`, the covariates
# passed through `covs`, and the cutoff at a score of 0.
rdrobust(
    y=df['y'],
    x=df['score'],
    fuzzy=df['d'],
    covs=df[cov_names],
    c=0.0,
)

Call: rdrobust
Number of Observations:                  5000
Polynomial Order Est. (p):                  1
Polynomial Order Bias (q):                  2
Kernel:                            Triangular
Bandwidth Selection:                    mserd
Var-Cov Estimator:                         NN

                                Left      Right
------------------------------------------------
Number of Observations          2500       2500
Number of Unique Obs.           2500       2500
Number of Effective Obs.         647        672
Bandwidth Estimation           0.266      0.266
Bandwidth Bias                 0.421      0.421
rho (h/b)                      0.632      0.632

Method             Coef.     S.E.   t-stat    P>|t|       95% CI      
-------------------------------------------------------------------------
Conventional       1.444    0.333    4.338   1.437e-05     [0.792, 2.097]
Robust                 -        -    3.987   6.694e-05     [0.786, 2.306]




## RDD with flexible adjustment

In [6]:
# Wrap the frame for DoubleML and declare the role of each column:
# outcome `y`, treatment `d`, covariates `cov_names`, score column `score`.
dml_data = dml.DoubleMLData(
    df,
    y_col='y',
    d_cols='d',
    x_cols=cov_names,
    s_col='score',
)

In [7]:
# Both learners share the same LightGBM hyperparameters; verbose=-1 silences training logs.
lgbm_params = dict(n_estimators=100, learning_rate=0.01, verbose=-1)
ml_g = LGBMRegressor(**lgbm_params)
ml_m = LGBMClassifier(**lgbm_params)

# NOTE(review): released DoubleML versions expose an explicit `fuzzy` flag on RDFlex
# (default False), while the rdrobust benchmark in this notebook is fuzzy -- confirm
# that the installed version estimates the intended design here.
rdflex_model = RDFlex(dml_data, ml_g, ml_m, n_folds=5, n_rep=1)
rdflex_model.fit(n_iterations=2)

print(rdflex_model)

Method                  Coef.     S.E.    P>|t|            95% CI
-----------------------------------------------------------------
Conventional            1.118    0.281      0.0    0.567 to 1.67    
Robust                      -        -    0.005    0.328 to 1.863   


In [8]:
# Show the first underlying rdrobust result stored on the fitted model.
# `_rdd_obj` is a private attribute, so this may break across DoubleML versions;
# presumably it holds one entry per repetition (n_rep=1 here) -- TODO confirm.
first_rdd_fit = rdflex_model._rdd_obj[0]
first_rdd_fit

Call: rdrobust
Number of Observations:                   902
Polynomial Order Est. (p):                  1
Polynomial Order Bias (q):                  2
Kernel:                            Triangular
Bandwidth Selection:                   Manual
Var-Cov Estimator:                         NN

                                Left      Right
------------------------------------------------
Number of Observations           438        464
Number of Unique Obs.            438        464
Number of Effective Obs.         438        464
Bandwidth Estimation           0.189      0.189
Bandwidth Bias                 0.189      0.189
rho (h/b)                        1.0        1.0

Method             Coef.     S.E.   t-stat    P>|t|       95% CI      
-------------------------------------------------------------------------
Conventional       1.118    0.281    3.974   7.057e-05      [0.567, 1.67]
Robust                 -        -    2.798   5.137e-03     [0.328, 1.863]


