# DP CATE Simulation: Part 2
#### Fengshi Niu, Harsha Nori, Brian Quistorff, Rich Caruana, Donald Ngwe, Aadharsh Kannan

This notebook contains experiments in the paper "Differentially Private Estimation of Heterogeneous Treatment Effects".

The code below produces the following, in order:
1. Two plots for exploratory analysis of a dataset
2. Figure 2, comparison of DP-EBM-DR-learner, DP-EBM-R-learner, and DP-EBM-S-learner
3. Figure 3, MSE, bias, and variance

In [None]:
# !pip install -U plotly plotly-orca kaleido

In [None]:
import ast
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots

In [None]:
def _mean_of_runs(serialized):
    """Return the mean of the non-missing entries of a stringified list.

    The per-run metric columns of the CSV hold Python list literals such as
    ``"[0.017, None, 0.018]"``. The text is parsed with ``ast.literal_eval``
    (literals only, so nothing in the CSV can execute code, unlike ``eval``)
    and ``None`` entries are skipped before averaging.
    """
    runs = ast.literal_eval(serialized)
    return np.mean([value for value in runs if value is not None])


# Load the simulation results; NaN cells become None.
results = pd.read_csv('./sample_data/dp_cate_simulation_final.csv').replace({np.nan: None})

# Keep a single row (the first) per experiment configuration.
results.drop_duplicates(subset=['dataset', 'data_size', 'd',
                                'sigma', 'learner', 'model', 'epsilon'],
                        keep='first', inplace=True, ignore_index=True)

# Collapse every per-run metric list into a scalar '<metric>_mean' column.
# The order below fixes the order of the new columns in the frame.
for metric in ['MSE', 'bias_ATE', 'iBias2', 'iVar',
               'MSE_ATE', 'iBias2_ATE', 'iVar_ATE']:
    results[f'{metric}_mean'] = results[metric].apply(_mean_of_runs)

# Drop raw columns that are no longer needed once the means exist.
results = results.drop(columns=['MSE', 'MSEavg', 'iBias2', 'iVar', 'bias_ATE', 'MSE_ATE'])

In [None]:
# Preview the first rows of `results` to check the columns built above.
results.head()

Unnamed: 0,dataset,data_size,d,sigma,learner,model,epsilon,MSEavg_ATE,num_effective_avg,iBias2_ATE,iVar_ATE,MSE_mean,bias_ATE_mean,iBias2_mean,iVar_mean,MSE_ATE_mean,iBias2_ATE_mean,iVar_ATE_mean
0,Voting,8000,11,,SLearner1,Lasso,,"[0.01774654644053732, 0.01882658717591613, 0.0...","[5, 5, 5, 5, 5]","[0.017580479180748794, 0.0182816018193154, 0.0...","[0.0008303362989426407, 0, 0.00192600977867254...",0.01822,0.044406,0.018044,0.000177,0.01855,0.017808,0.000742
1,Voting,8000,11,,DRLearner,Lasso,,"[0.016108284622893057, 0.016220312811940515, 0...","[5, 5, 5, 5, 5]","[0.01606228287494413, 0.016149296132936374, 0....","[0.00023000873974462262, 0, 0.0001674532069208...",0.002774,0.004045,0.001504,0.001271,0.016229,0.016086,0.000143
2,Voting,8000,11,,RLearner,Lasso,,"[0.016110013277194375, 0.016171936833483603, 0...","[5, 5, 5, 5, 5]","[0.016039462334114452, 0.016108034728945824, 0...","[0.00035275471539961134, 0, 0.0001516484825301...",0.002672,0.002472,0.001345,0.001327,0.016235,0.016072,0.000163
3,Voting,4000,11,,SLearner1,Lasso,,"[0.017388493258822753, 0.0188775492571124, 0.0...","[5, 5, 5, 5, 5]","[0.017047397565209673, 0.018514881467831593, 0...","[0, 0.0018133389464040396, 0, 0.00721271160712...",0.018313,0.044421,0.018036,0.000277,0.019446,0.017362,0.002084
4,Voting,4000,11,,DRLearner,Lasso,,"[0.016106610171526852, 0.01618997985878405, 0....","[5, 5, 5, 5, 5]","[0.016106391319096233, 0.016161206699009653, 0...","[1.0942621530901295e-06, 0.0001438657988719980...",0.004347,0.000979,0.001361,0.002986,0.016529,0.01604,0.000489


## Figure 2, comparison across DP-EBM-?-learners

In [None]:
# MSE across learners: one panel per sample size, epsilon on the x-axis.
dataset = 'Voting'
data_sizes = [500, 1_000, 2_000, 4_000, 8_000, 16_000, 32_000]
# Epsilon values shown on the x-axis, in left-to-right tick order.
epsilons = [16, 8, 4, 2, 1]
# The axis is categorical: every epsilon is drawn at its 1-based tick position.
# Looking the position up per row (instead of assuming exactly five rows)
# keeps each point under its own tick even if a run is missing or extra.
epsilon_position = {epsilon: position
                    for position, epsilon in enumerate(epsilons, start=1)}

fig = make_subplots(
    rows=1, cols=len(data_sizes),
    horizontal_spacing=0.037,
    subplot_titles=[f'n = {data_size}' for data_size in data_sizes])

color_DRLearner, color_SLearner, color_RLearner = px.colors.qualitative.Plotly[0:3]
color_dict = {'DRLearner': color_DRLearner, 'SLearner1': color_SLearner, 'RLearner': color_RLearner}

for col, data_size in enumerate(data_sizes, start=1):
    panel = results[(results['dataset'] == dataset) &
                    (results['model'] == 'DPExplainableBoostingRegressor') &
                    (results['data_size'] == data_size)]
    # Only epsilons that have a tick can be placed on the axis.
    panel = panel[panel['epsilon'].isin(epsilons)]
    panel = panel.sort_values('epsilon', ascending=False)

    # One legend entry per learner is enough, so only the first panel adds them.
    showlegend = col == 1

    for learner in sorted(panel['learner'].unique()):
        runs = panel[panel['learner'] == learner]
        fig.add_trace(go.Scatter(x=[epsilon_position[eps] for eps in runs['epsilon']],
                                 y=runs['MSE_mean'],
                                 name=learner if learner != 'SLearner1' else 'SLearner',
                                 showlegend=showlegend,
                                 line={'color': color_dict[learner]}),
                      row=1, col=col)

fig.update_yaxes(type="log", range=[-3, 3])

fig.update_layout(
    # title="𝜀 - MSE, Voting",
    xaxis_title="𝜀",
    yaxis_title="MSE",
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="RebeccaPurple"
    ),
    margin=dict(
        t=20,
        b=10,
        l=10,
        r=10,
    ),
    legend_title_text='Learner',
    autosize=False,
    width=1_200,
    height=300
)

fig.update_xaxes(
    dict(
        title="𝜀",
        tickmode='array',
        tickvals=list(epsilon_position.values()),
        ticktext=epsilons
    )
)
fig.show()
# Create the output directory first so the export does not fail on a fresh checkout.
os.makedirs("plots", exist_ok=True)
fig.write_image("plots/ATE_CATE1.png", height=300, width=1_200)

In [None]:
# MSE across learners: one panel per epsilon, sample size n on the x-axis.
dataset = 'Voting'
epsilons = [16, 8, 4, 2, 1]
# Sample sizes shown on the x-axis, in left-to-right tick order.
data_sizes = [500, 1000, 2000, 4000, 8000, 16000, 32000]
# The axis is categorical: every sample size is drawn at its 1-based tick
# position. Looking the position up per row (instead of assuming exactly seven
# rows) keeps each point under its own tick even if a run is missing or extra.
size_position = {data_size: position
                 for position, data_size in enumerate(data_sizes, start=1)}

fig = make_subplots(
    rows=1, cols=len(epsilons),
    horizontal_spacing=0.037,
    subplot_titles=[f'𝜀 = {epsilon}' for epsilon in epsilons])

color_DRLearner, color_SLearner, color_RLearner = px.colors.qualitative.Plotly[0:3]
color_dict = {'DRLearner': color_DRLearner, 'SLearner1': color_SLearner, 'RLearner': color_RLearner}

for col, epsilon in enumerate(epsilons, start=1):
    panel = results[(results['dataset'] == dataset) &
                    (results['model'] == 'DPExplainableBoostingRegressor') &
                    (results['epsilon'] == epsilon)]
    # Only sample sizes that have a tick can be placed on the axis.
    panel = panel[panel['data_size'].isin(data_sizes)]
    panel = panel.sort_values('data_size')

    # One legend entry per learner is enough, so only the first panel adds them.
    showlegend = col == 1

    for learner in sorted(panel['learner'].unique()):
        runs = panel[panel['learner'] == learner]
        fig.add_trace(go.Scatter(x=[size_position[size] for size in runs['data_size']],
                                 y=runs['MSE_mean'],
                                 name=learner if learner != 'SLearner1' else 'SLearner',
                                 showlegend=showlegend,
                                 line={'color': color_dict[learner]}),
                      row=1, col=col)

fig.update_yaxes(type="log", range=[-3, 3])

fig.update_layout(
    yaxis_title="MSE",
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="RebeccaPurple"
    ),
    margin=dict(
        t=20,
        b=10,
        l=10,
        r=10,
    ),
    legend_title_text='Learner',
    autosize=False,
    width=1_200,
    height=300
)

fig.update_xaxes(
    dict(
        title="n",
        tickmode='array',
        tickvals=list(size_position.values()),
        ticktext=data_sizes,
        tickangle=45,
    )
)

# Create the output directory first so the export does not fail on a fresh checkout.
os.makedirs("plots", exist_ok=True)
fig.write_image("plots/ATE_CATE2.png", height=300, width=1_200)
fig.show()

## Figure 3, MSE, bias, variance

In [None]:
# MSE, bias, variance of the DRLearner with the DP-EBM model, one panel per dataset.
# NOTE: this rebinds the notebook-wide `results`; rows whose epsilon is missing
# or above 100 are gone for every cell executed after this one.
results = results[results['epsilon'] <= 100]

# Each entry: [dataset name, sample size, (log10 y-axis lower, upper bound)].
configs = [['Voting', 16_000, (-3.5, -0.8)],
           ['A', 16_000, (-2.5, 0.2)],
           ['B', 16_000, (-2.5, 0.2)],
           ['C', 4_000, (-1.5, 1.2)],
           ['D', 4_000, (-1.5, 1.2)],
           ['E', 4_000, (-0.5, 2.2)],
           ]

# Epsilon values shown on the x-axis, in left-to-right tick order.
epsilon_ticks = [16, 8, 4, 2, 1]
# The axis is categorical: every epsilon is drawn at its 1-based tick position.
# Looking the position up per row (instead of assuming exactly five rows)
# keeps each point under its own tick even if a run is missing or extra.
epsilon_position = {epsilon: position
                    for position, epsilon in enumerate(epsilon_ticks, start=1)}

fig = make_subplots(
    rows=1, cols=len(configs),
    subplot_titles=[f'{dataset}' for dataset, data_size, _ in configs])

color_mse, color_bias, color_var = px.colors.qualitative.Plotly[0:3]
# (column holding the values, legend label, line colour) for each curve.
curves = [('MSE_mean', 'MSE', color_mse),
          ('iBias2_mean', 'Bias', color_bias),
          ('iVar_mean', 'Variance', color_var)]

for col, (dataset, data_size, y_ranges) in enumerate(configs, start=1):
    results_plot = results[(results['dataset'] == dataset) &
                           (results['data_size'] == data_size) &
                           (results['model'] == 'DPExplainableBoostingRegressor') &
                           (results['learner'] == 'DRLearner')]
    # Only epsilons that have a tick can be placed on the axis.
    results_plot = results_plot[results_plot['epsilon'].isin(epsilon_ticks)]
    results_plot = results_plot.sort_values('epsilon', ascending=False)
    x_positions = [epsilon_position[eps] for eps in results_plot['epsilon']]

    # One legend entry per curve is enough, so only the first panel adds them.
    showlegend = col == 1

    for column, label, color in curves:
        fig.add_trace(go.Scatter(x=x_positions, y=results_plot[column], name=label,
                                 showlegend=showlegend,
                                 line={'color': color}),
                      row=1, col=col)
    # Every panel gets its own log-scale window from `configs`.
    fig.update_yaxes(type="log", dtick=1, range=list(y_ranges), row=1, col=col)

fig.update_layout(
    # title="𝜀 - MSE, Bias, and Variance, DRlearner",
    # xaxis_title="𝜀",
    yaxis_title="Squared Error",
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="RebeccaPurple"
    ),
    margin=dict(
        t=20,
        b=10,
        l=10,
        r=10,
    ),
    # legend_title_text='Metric',
    autosize=False,
    width=1200,
    height=300
)

fig.update_xaxes(
    dict(
        title="𝜀",
        tickmode='array',
        tickvals=list(epsilon_position.values()),
        ticktext=epsilon_ticks
    )
)
fig.show()
# Create the output directory first so the export does not fail on a fresh checkout.
os.makedirs("plots", exist_ok=True)
fig.write_image("plots/mse_bias_variance.png", height=300, width=1_200, engine='kaleido')