# DP CATE Simulation: Part 1

#### Fengshi Niu, Harsha Nori, Brian Quistorff, Rich Caruana, Donald Ngwe, Aadharsh Kannan

This notebook contains experiments in the paper "Differentially Private Estimation of Heterogeneous Treatment Effects".

The code below is organized in the following order:
1. Classes of DP CATE meta learners with sample splitting. 
    - DR-learner, R-learner, S-learner for DP CATE
2. Data generators for the six datasets
    - `['Voting', 'A', 'B', 'C', 'D', 'E']`
3. Experiment hyperparameters and helper functions
    - Experiment index by `['dataset', 'data_size', 'd', 'sigma', 'learner', 'model', 'epsilon']`
    - Metrics: MSE, integrated squared bias, integrated variance
    - All experiments fall into
    ```
    datasets_world = {'Voting', 'A', 'B', 'C', 'D', 'E'}
    datasizes_world = {500, 1_000, 2_000, 4_000, 8_000, 16_000, 32_000}
    ds_world = {None, 6, 12}
    sigmas_world = {None, 0.5, 1, 2}
    learners_world = {'SLearner1', 'DRLearner', 'RLearner'}
    models_nonprivate_world = {'Lasso', 'ExplainableBoostingRegressor'}
    models_private_world = {'DPExplainableBoostingRegressor'}
    epsilons_world = {None, 1, 2, 4, 8, 16, 128}
    ```
4. Run experiments and save the results
    - saved as `./sample_data/dp_cate_simulation_final.csv`
5. Figure 1: EBM Shape Function Plots
6. Exploratory plots
7. NoisySGD based linear model and NN [to be finished]

In [None]:
# !pip install -U interpret econml opacus delayed scikit-learn plotly kaleido

In [None]:
import numpy as np
import pandas as pd
import scipy
from sklearn import clone
from sklearn.model_selection import train_test_split
from interpret.privacy import DPExplainableBoostingClassifier, DPExplainableBoostingRegressor
from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from econml.sklearn_extensions.model_selection import GridSearchCVList

from econml.metalearners import SLearner, XLearner, DomainAdaptationLearner
from econml.metalearners import TLearner as TLearnerEconML
from econml.dml import NonParamDML as RLearnerEconML
from econml.dr import DRLearner as DRLearnerEconML

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from matplotlib import pyplot as plt

## 1. Classes of DP CATE meta learners with sample splitting

In [None]:
# SLearner1, DRLearner, and RLearner can be used to construct private algorithms
class SLearner1(SLearner):
    """econml ``SLearner`` with the evaluation helpers shared by the learners in this file."""

    def cate_pred(self, X):
        """Return the estimated CATE for each row of ``X`` via ``self.effect``."""
        estimated_effect = self.effect(X)
        return estimated_effect

    def ate(self, X):
        """Return the average of the predicted CATE over the rows of ``X``."""
        predictions = self.cate_pred(X)
        return predictions.mean()

    def mse(self, X_test, cate_test):
        """Return the mean squared error between the predicted CATE and ``cate_test``."""
        predictions = self.cate_pred(X_test)
        return mean_squared_error(predictions, cate_test)

    
class DRLearner():
    """Doubly robust (DR) CATE learner with sample splitting.

    ``fit`` partitions the data into three disjoint folds: one trains the
    propensity model, one trains the outcome model on ``[T, X]``, and the
    last one is used to build doubly robust pseudo-outcomes on which the
    CATE model is fitted.

    Parameters
    ----------
        model_propensity: classifier exposing ``fit`` and ``predict_proba``
        model_outcome: regressor exposing ``fit`` and ``predict``
        model_cate: regressor exposing ``fit`` and ``predict``
        clip (func): applied to predicted probabilities to keep them away from 0 and 1
        seed (int): random state used for both data splits
    """

    def __init__(self,
                 model_propensity,
                 model_outcome, 
                 model_cate, 
                 clip=lambda x: np.clip(x, 0.05, 0.95), 
                 seed=123):
        # Clone so that the estimators passed in by the caller are never fitted in place.
        self.model_propensity = clone(model_propensity, safe=True)
        self.model_outcome = clone(model_outcome, safe=True)
        self.model_cate = clone(model_cate, safe=True)
        self.clip = clip
        self.seed = seed
    
    def fit(self, y, T, X, sample_ratio_2nd_stage=0.5, sample_ratio_propensity=0.25):
        """Fit the nuisance models and the CATE model on disjoint folds.

        Parameters
        ----------
            y: observed outcomes, one per row of ``X``
            T: binary treatment indicator, one per row of ``X``
            X: covariate matrix
            sample_ratio_2nd_stage (float): share of the data used to fit the CATE model
            sample_ratio_propensity (float): share of the data used to fit the propensity
                model; the remaining first-stage rows train the outcome model
        """
        # pandas Series have no ``reshape``, so work on flat ndarrays for T and y.
        T = np.asarray(T).ravel()
        y = np.asarray(y).ravel()

        # First split: (nuisance fold, second-stage fold).
        X1, X2, T1, T2, y1, y2 = train_test_split(X, T, y,
                                                  test_size=sample_ratio_2nd_stage, 
                                                  stratify=T,
                                                  random_state=self.seed)
        # Second split of the nuisance fold: (propensity fold, outcome fold).
        X11, X12, T11, T12, y11, y12 = train_test_split(X1, T1, y1, 
                        test_size=1 - sample_ratio_propensity/(1-sample_ratio_2nd_stage),
                        stratify=T1,
                        random_state=self.seed)
        self.model_propensity.fit(X11, T11)
        
        # The outcome model sees the treatment as its first feature.
        TX12 = np.concatenate((T12.reshape((-1, 1)), X12), axis=1)
        self.model_outcome.fit(TX12, y12)

        treatTX2 = np.concatenate((np.ones((X2.shape[0], 1)), X2), axis=1)
        controlTX2 = np.concatenate((np.zeros((X2.shape[0], 1)), X2), axis=1)

        # Predict once and reuse: each quantity appears twice in the DR formula.
        mu_treated = self.model_outcome.predict(treatTX2)
        mu_control = self.model_outcome.predict(controlTX2)
        propensity = self.model_propensity.predict_proba(X2)
        p_treated = self.clip(propensity[:, 1])
        p_control = self.clip(propensity[:, 0])

        # Outcome-model prediction plus an inverse-propensity-weighted residual correction.
        y_treatment_dr = mu_treated + (y2 - mu_treated) / p_treated * T2
        y_control_dr = mu_control + (y2 - mu_control) / p_control * (1 - T2)

        phi_dr = y_treatment_dr - y_control_dr

        self.model_cate.fit(X2, phi_dr)
       
    def cate_pred(self, X):
        """Return the CATE model's prediction for each row of ``X``."""
        return self.model_cate.predict(X)

    def propensity_pred(self, X):
        """Return the clipped class probabilities of the propensity model for ``X``."""
        return self.clip(self.model_propensity.predict_proba(X))

    def ate(self, X):
        """Return the average predicted CATE over the rows of ``X``."""
        return self.cate_pred(X).mean()
    
    def mse(self, X_test, cate_test):
        """Return the mean squared error between the predicted CATE and ``cate_test``."""
        return mean_squared_error(self.cate_pred(X_test), cate_test)

    
class RLearner():
    """R-learner for CATE with sample splitting.

    ``fit`` partitions the data into three disjoint folds: one trains the
    propensity model, one trains the outcome model on ``X`` alone, and the
    last one is used for a weighted regression of outcome residuals divided
    by treatment residuals.

    Parameters
    ----------
        model_propensity: classifier exposing ``fit`` and ``predict_proba``
        model_outcome: regressor exposing ``fit`` and ``predict``
        model_cate: regressor whose ``fit`` accepts ``sample_weight``
        clip (func): applied to predicted probabilities to keep them away from 0 and 1
        seed (int): random state used for both data splits
    """

    def __init__(self, 
                 model_propensity, 
                 model_outcome, 
                 model_cate, 
                 clip=lambda x: np.clip(x, 0.05, 0.95), 
                 seed=123):
        # Clone so that the estimators passed in by the caller are never fitted in place.
        self.model_propensity = clone(model_propensity, safe=True)
        self.model_outcome = clone(model_outcome, safe=True)
        self.model_cate = clone(model_cate, safe=True)
        self.clip = clip
        self.seed = seed
    
    def fit(self, y, T, X, sample_ratio_2nd_stage=0.5, sample_ratio_propensity=0.25):
        """Fit the nuisance models and the CATE model on disjoint folds.

        Parameters
        ----------
            y: observed outcomes, one per row of ``X``
            T: binary treatment indicator, one per row of ``X``
            X: covariate matrix
            sample_ratio_2nd_stage (float): share of the data used to fit the CATE model
            sample_ratio_propensity (float): share of the data used to fit the propensity
                model; the remaining first-stage rows train the outcome model
        """
        # Work on flat ndarrays so pandas inputs behave like numpy inputs.
        T = np.asarray(T).ravel()
        y = np.asarray(y).ravel()

        # Data splitting: (nuisance fold, second-stage fold), then the nuisance
        # fold into (propensity fold, outcome fold).
        X1, X2, T1, T2, y1, y2 = train_test_split(X, T, y,
                                                  test_size=sample_ratio_2nd_stage, 
                                                  stratify=T,
                                                  random_state=self.seed)
        X11, X12, T11, T12, y11, y12 = train_test_split(X1, T1, y1, 
                        test_size=1 - sample_ratio_propensity/(1-sample_ratio_2nd_stage),
                        stratify=T1,
                        random_state=self.seed)
        self.model_propensity.fit(X11, T11)
        self.model_outcome.fit(X12, y12)

        # Compute each residual once; the treatment residual is used twice below.
        outcome_residual = y2 - self.model_outcome.predict(X2)
        treatment_residual = T2 - self.clip(self.model_propensity.predict_proba(X2)[:, 1])

        pseudo_outcome = outcome_residual / treatment_residual
        pseudo_weight = treatment_residual**2

        self.model_cate.fit(X=X2, y=pseudo_outcome, sample_weight=pseudo_weight)
       
    def cate_pred(self, X):
        """Return the CATE model's prediction for each row of ``X``."""
        return self.model_cate.predict(X)

    def propensity_pred(self, X):
        """Return the clipped class probabilities of the propensity model for ``X``."""
        return self.clip(self.model_propensity.predict_proba(X))

    def ate(self, X):
        """Return the average predicted CATE over the rows of ``X``."""
        return self.cate_pred(X).mean()
    
    def mse(self, X_test, cate_test):
        """Return the mean squared error between the predicted CATE and ``cate_test``."""
        return mean_squared_error(self.cate_pred(X_test), cate_test)

In [None]:
# # Learners below, mostly from econml, are not used to construct private algo
# class TLearner():
#     # Not private!
#     def __init__(self, models):
#         self.models = clone(models, safe=True)

#     def fit(self, y, T, X):
#         self.treatment_model = clone(self.models, safe=True)
#         self.control_model = clone(self.models, safe=True)
#         self.treatment_model.fit(X[T == 1], y[T == 1])
#         self.control_model.fit(X[T == 0], y[T == 0])
    
#     def cate_pred(self, X):
#         return self.treatment_model.predict(X) - self.control_model.predict(X)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)    
    

# class SLearnerExpand():
#     def __init__(self, models):
#         self.models = clone(models, safe=True)

#     def fit(self, y, T, X):
#         X_all = np.concatenate((T.reshape([-1, 1]), T.reshape([-1, 1]) * X, X), axis=1)
#         self.alloutcome_model = clone(self.models, safe=True)
#         self.alloutcome_model.fit(X_all, y)

    
#     def cate_pred(self, X):
#         X_treat = np.concatenate((np.ones((X.shape[0], 1)), X, X), axis=1)
#         X_control = np.concatenate((np.zeros((X.shape[0], 1)), np.zeros(X.shape), X), axis=1)
#         return self.alloutcome_model.predict(X_treat) - self.alloutcome_model.predict(X_control)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)


# class XLearner1(XLearner):
#     def cate_pred(self, X):
#         return self.effect(X)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)
    
    
# class DomainAdaptationLearner1(DomainAdaptationLearner):
#     def cate_pred(self, X):
#         return self.effect(X)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)
    

# class TLearnerEconML1(TLearnerEconML):
#     def cate_pred(self, X):
#         return self.effect(X)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)
 

# class RLearnerEconML1(RLearnerEconML):
#     def cate_pred(self, X):
#         return self.effect(X)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)

    
# class DRLearnerEconML1(DRLearnerEconML):
#     def cate_pred(self, X):
#         return self.effect(X)
    
#     def ate(self, X):
#         return self.cate_pred(X).mean()
    
#     def mse(self, X_test, cate_test):
#         return mean_squared_error(self.cate_pred(X_test), cate_test)



# # auto ml
# def auto_reg(random_state=123):
#     return GridSearchCVList([Lasso(random_state=random_state),
#                              RandomForestRegressor(n_estimators=400, 
#                                                    random_state=random_state, n_jobs=-2),
#                              LGBMRegressor(random_state=random_state, n_jobs=-2),
#                              ExplainableBoostingRegressor(random_state=random_state, n_jobs=-2)],
#                              param_grid_list=[{'alpha': [.001, .01, .1, 1]},
#                                               {'max_depth': [3, 6, None],
#                                                'min_samples_leaf': [10, 30]},
#                                               {},
#                                               {}],
#                              cv=2,
#                              scoring='neg_mean_squared_error',
#                              n_jobs=-2)

# def auto_clf(random_state=123):
#     return GridSearchCVList([LogisticRegression(random_state=random_state),
#                              RandomForestClassifier(n_estimators=400, 
#                                                     random_state=random_state, n_jobs=-2),
#                              LGBMClassifier(random_state=random_state, n_jobs=-2),
#                              ExplainableBoostingClassifier(random_state=random_state, n_jobs=-2)],
#                              param_grid_list=[{'C': [0.001, 0.01, .1, 1]},
#                                               {'max_depth': [3, 6, None],
#                                               'min_samples_leaf': [10, 30]},
#                                               {},
#                                               {}],
#                              cv=2,
#                              scoring='neg_log_loss',
#                              n_jobs=-2)

## 2. Data generator

In [None]:
def data_generator_base(n, 
                        d, 
                        covariates_model, 
                        propensity_model,
                        control_outcome_model,
                        treatment_effect_model,
                        special_test_point=None,
                        special_test_value=None,
                        seed=1234):
    """Generates iid population data for given covariates_model, propensity_model, 
    control_outcome_model, and treatment_effect_model.
    
    Parameters
    ----------
        n (int): total sample size. An additional of max(1000, 0.5*n) sample is used for test
        d (int): number of covariates
        covariates_model (func): d-dimension covariate DGP, take in an input d
        propensity_model (func): probability of treatment conditional on covariates
        control_outcome_model (func): actual untreated outcome
        treatment_effect_model (func): actual treatment effect
        special_test_point: stored unchanged as the first item of ``data['special_test']``
        special_test_value: stored unchanged as the second item of ``data['special_test']``
        seed (int): seed passed to ``np.random.seed`` before any draw

    Returns
    -------
        dict with the training arrays 'X', 'P', 'T', 'Y', 'Y0', 'Y1', 'treatment_effect'
        (first n rows), the held-out arrays 'X_test', 'treatment_effect_test', 'Y_test'
        (remaining rows) and 'special_test'.
    """
    def as_float(value):
        # np.apply_along_axis allocates its output with the dtype of the FIRST
        # result, so an integer first value (e.g. a literal 0 or 1) would make
        # every later float be truncated. Forcing float avoids that.
        return np.asarray(value, dtype=float)

    np.random.seed(seed)
    idx = np.arange(int(max(n * 1.5, n+1000))).reshape([-1, 1])
    # Generate covariates (one model call per row, in row order)
    X = np.apply_along_axis(lambda i: covariates_model(d), 1, idx)
    # Generate treatment
    P = np.apply_along_axis(lambda x: as_float(propensity_model(x)), 1, X).reshape([-1, 1])
    T = np.apply_along_axis(lambda p: np.random.binomial(1, p, 1), 1, P).reshape([-1, 1])
    # Generate outcome
    Y0 = np.apply_along_axis(lambda x: as_float(control_outcome_model(x)), 1, X).reshape([-1, 1])
    treatment_effect = np.apply_along_axis(lambda x: as_float(treatment_effect_model(x)), 1, X).reshape([-1, 1])
    Y1 = Y0 + treatment_effect
    # Observed outcome: Y1 for treated rows, Y0 for control rows
    Y = Y0 * (1-T) + Y1 * T
    P, T, Y0, Y1, Y = P.ravel(), T.ravel(), Y0.ravel(), Y1.ravel(), Y.ravel()
    treatment_effect = treatment_effect.flatten()

    # First n rows form the training population, the rest is held out for testing
    X, X_test = X[:n], X[n:]
    P = P[:n]
    T = T[:n]
    Y0 = Y0[:n]
    Y1 = Y1[:n]
    Y, Y_test = Y[:n], Y[n:]
    treatment_effect, treatment_effect_test = treatment_effect[:n], treatment_effect[n:]

    data = {'X': X, 
            'P': P,
            'T': T, 
            'Y': Y,
            'Y0': Y0, 
            'Y1': Y1, 
            'treatment_effect': treatment_effect, 
            'X_test': X_test, 
            'treatment_effect_test': treatment_effect_test,
            'Y_test': Y_test,
            'special_test': [special_test_point, special_test_value]
            }

    return data


def data_generator(n, 
                   d, 
                   covariates_model, 
                   propensity_model,
                   conditional_control_outcome_model,
                   sigma_control,
                   cate_model,
                   sigma_treatment_effect,
                   special_test_point=None,
                   special_test_value=None,
                   seed=1234):
    """A restricted version of data_generator_base with additive Gaussian noise.

    The untreated outcome is ``conditional_control_outcome_model(x)`` plus
    N(0, sigma_control) noise and the individual treatment effect is
    ``cate_model(x)`` plus N(0, sigma_treatment_effect) noise. The noiseless
    CATE is added to the result as 'cate' and 'cate_test'.
    
    Parameters
    ----------
        n (int): total sample size. An additional of max(1000, 0.5*n) sample is used for test
        d (int): number of covariates
        covariates_model (func): d-dimension covariate DGP, take in an input d
        propensity_model (func): probability of treatment conditional on covariates
        conditional_control_outcome_model (func): expected untreated outcome conditional on covariates
        sigma_control: noise level in control_outcome_model
        cate_model (func): conditional average treatment effect given covariates X
        sigma_treatment_effect (float): noise level in treatment_effect_model
        special_test_point, special_test_value: forwarded to data_generator_base
        seed (int): forwarded to data_generator_base

    Returns
    -------
        The dict produced by data_generator_base, extended with 'cate' and 'cate_test'.
    """
    def exact_cate(x):
        # np.apply_along_axis takes its output dtype from the first row. A
        # cate_model that returns a Python int there (e.g. ``max(s, 0)`` hitting
        # the literal 0) would otherwise truncate every later value to an integer.
        return np.asarray(cate_model(x), dtype=float)

    data = data_generator_base(n=n, 
                               d=d, 
                               covariates_model=covariates_model, 
                               propensity_model=propensity_model,
                               control_outcome_model=lambda x: conditional_control_outcome_model(x) + np.random.normal(0, sigma_control, size=1),
                               treatment_effect_model=lambda x: cate_model(x) + np.random.normal(0, sigma_treatment_effect, size=1),
                               special_test_point=special_test_point,
                               special_test_value=special_test_value,
                               seed=seed)
    data['cate'] = np.apply_along_axis(exact_cate, 1, data['X']).flatten()
    data['cate_test'] = np.apply_along_axis(exact_cate, 1, data['X_test']).flatten()
    return data
    

# Fixed 12x12 mixing matrix used to build the covariate covariance of dataset 'E'.
# The literal below is hard-coded; the two commented lines show the draw it
# appears to come from:
# np.random.seed(1)
# m_0 = np.random.uniform(-1, 1, (12, 12))
m_0 = np.array([[-0.16595599,  0.44064899, -0.99977125, -0.39533485, -0.70648822,
        -0.81532281, -0.62747958, -0.30887855, -0.20646505,  0.07763347,
        -0.16161097,  0.370439  ],
       [-0.5910955 ,  0.75623487, -0.94522481,  0.34093502, -0.1653904 ,
         0.11737966, -0.71922612, -0.60379702,  0.60148914,  0.93652315,
        -0.37315164,  0.38464523],
       [ 0.7527783 ,  0.78921333, -0.82991158, -0.92189043, -0.66033916,
         0.75628501, -0.80330633, -0.15778475,  0.91577906,  0.06633057,
         0.38375423, -0.36896874],
       [ 0.37300186,  0.66925134, -0.96342345,  0.50028863,  0.97772218,
         0.49633131, -0.43911202,  0.57855866, -0.79354799, -0.10421295,
         0.81719101, -0.4127717 ],
       [-0.42444932, -0.73994286, -0.96126608,  0.35767107, -0.57674377,
        -0.46890668, -0.01685368, -0.89327491,  0.14823521, -0.70654285,
         0.17861107,  0.39951672],
       [-0.79533114, -0.17188802,  0.38880032, -0.17164146, -0.90009308,
         0.07179281,  0.32758929,  0.02977822,  0.88918951,  0.17311008,
         0.80680383, -0.72505059],
       [-0.72144731,  0.61478258, -0.20464633, -0.66929161,  0.85501716,
        -0.30446828,  0.50162421,  0.45199597,  0.76661218,  0.24734441,
         0.50188487, -0.30220332],
       [-0.46014422,  0.79177244, -0.14381762,  0.92968009,  0.326883  ,
         0.24339144, -0.77050805,  0.89897852, -0.10017573,  0.15677923,
        -0.18372639, -0.52594604],
       [ 0.80675904,  0.14735897, -0.99425935,  0.23428983, -0.3467102 ,
         0.0541162 ,  0.7718842 , -0.28546048,  0.8170703 ,  0.24672023,
        -0.96835751,  0.85887447],
       [ 0.38179384,  0.9946457 , -0.65531898, -0.7257285 ,  0.86519093,
         0.39363632, -0.86799965,  0.51092611,  0.50775238,  0.84604907,
         0.42304952, -0.75145808],
       [-0.96023973, -0.94757803, -0.94338702, -0.50757786,  0.7200559 ,
         0.07766213,  0.10564396,  0.68406178, -0.75165337, -0.44163264,
         0.17151854,  0.9391915 ],
       [ 0.12206044, -0.96270542,  0.60126535, -0.53405145,  0.61421039,
        -0.22427871,  0.72708371,  0.49424329,  0.11248047, -0.72708955,
        -0.88016462, -0.75731309]])
# Gram matrix of m_0: symmetric and positive semi-definite, so any leading
# d x d block (d <= 12) can be used as a covariance matrix.
cov_0 = m_0 @ m_0.T

def data_simulate_prepare(dataset, d=6, data_size=500_000, sigma=1, seed=1234):
    """Simulate one of the synthetic datasets 'A', 'B', 'C', 'D' or 'E'.

    Parameters
    ----------
        dataset (str): one of 'A', 'B', 'C', 'D', 'E'
        d (int): number of covariates; the models of A-D read up to index 4
        data_size (int): training sample size passed to data_generator as n
        sigma (float): standard deviation of the control-outcome noise
        seed (int): random seed passed to data_generator

    Returns
    -------
        The dict returned by data_generator (treatment-effect noise is 0).

    Raises
    ------
        ValueError: if ``dataset`` is not one of the supported names.
    """
    # ABCD are specifications of simulations in RLearner paper
    if dataset == 'A': 
        L, U = 0.1, 0.9
        covariates_model = lambda d: np.random.uniform(0, 1, d)
        propensity_model = lambda x: np.clip(np.sin(np.pi * x[0] * x[1]), L, U) # Need to bound it below and above
        conditional_control_outcome_model = lambda x: np.sin(np.pi * x[0] * x[1]) + 2 * (x[2]-0.5)**2 + x[3] + 0.5 * x[4]
        cate_model = lambda x: (x[0] + x[1])/2
    elif dataset == 'B': 
        covariates_model = lambda d: np.random.normal(0, 1, d)
        propensity_model = lambda x: 0.5
        conditional_control_outcome_model = lambda x: max(0, x[0]+x[1], x[2]) + max(0, x[3]+x[4])
        cate_model = lambda x: x[0] + np.log(1 + np.exp(x[1]))
    elif dataset == 'C': 
        covariates_model = lambda d: np.random.normal(0, 1, d)
        propensity_model = lambda x: 1/(1 + np.exp(x[1] + x[2]))
        conditional_control_outcome_model = lambda x: 2 * np.log(1 + np.exp(x[0] + x[1] + x[2]))
        cate_model = lambda x: 1
    elif dataset == 'D': 
        covariates_model = lambda d: np.random.normal(0, 1, d)
        propensity_model = lambda x: 1/(1 + np.exp(-x[0]) + np.exp(-x[1]))
        conditional_control_outcome_model = lambda x: max(0, x[0]+x[1]) + max(0, x[3]+x[4])
        cate_model = lambda x: max(x[0] + x[1] + x[2], 0) - max(x[3] + x[4], 0)
    elif dataset == 'E':
        # Correlated latent Gaussian; first half kept continuous, second half thresholded to 0/1
        cov = cov_0[:d, :d]
        d_cont = d//2
        d_discrete = d - d//2
        # NOTE(review): d_cont is used as the *step* here, so for d_cont >= 3 this is
        # the single value [0] and adds no shift. np.linspace(0, 3, d_cont) may have
        # been intended - confirm before changing, since it alters dataset E.
        mean_cont = np.arange(0, 3, d_cont)
        threshold_discrete = np.ones(d_discrete)
        beta = np.linspace(0, 3, d)
        gamma = np.ones(d)
        gamma[0] = 0
        gamma[2] = -1
        L, U = 0.1, 0.9
        def covariates_model(d):
            latent = np.random.multivariate_normal(np.zeros(d), cov)
            x_cont = latent[:d_cont] + mean_cont
            x_discrete = (latent[d_cont:] > threshold_discrete).astype(int)
            return np.concatenate((x_cont, x_discrete))
        propensity_model = lambda x: np.clip(scipy.special.expit(x[0] + x[-1]), L, U) # Need to bound it below and above
        # Includes an extra noise term whose scale grows with x[0], on top of the sigma noise
        conditional_control_outcome_model = lambda x: np.dot(x, beta) + (x[0] + 1)*np.random.normal(0, 1, size=1) + x[0]*x[-1] + (0.8 if (x[2]>-0.5 and x[2]<0.5) else 0.2)
        cate_model = lambda x: np.dot(x, gamma) + 5 * scipy.special.expit(x[0])
    else:
        # Without this branch an unknown name surfaced later as an UnboundLocalError
        raise ValueError(f"Unknown dataset {dataset!r}; expected one of 'A', 'B', 'C', 'D', 'E'.")

    data = data_generator(n=data_size, 
                          d=d, 
                          covariates_model=covariates_model,
                          propensity_model=propensity_model,
                          conditional_control_outcome_model=conditional_control_outcome_model,
                          sigma_control=sigma,
                          cate_model=cate_model,
                          sigma_treatment_effect=0, 
                          seed=seed)
    return data
    

# Prepare voting data with synthetic tau
# From RLearner paper, github xnie
def data_voting_prepare(seed=1234):
    """Download the voting data and attach a synthetic treatment effect.

    All columns except the last two are used as covariates; columns 'Y' and 'W'
    are read as the observed outcome and the treatment. The synthetic CATE is
    ``-vote00 * 0.5 / (1 + 50 / age)``. For each row a Bernoulli draw with
    success probability ``|CATE|`` decides whether the potential outcomes are
    both set to the observed outcome or forced apart in the direction of the
    CATE's sign.

    Parameters
    ----------
        seed (int or None): seed of the local random generator used for the
            Bernoulli draws, so that repeated calls return the same data.
            Pass None for a fresh, non-reproducible draw.

    Returns
    -------
        dict of numpy arrays with the first 100_000 rows as the population
        ('X', 'T', 'Y', 'Y0', 'Y1', 'treatment_effect', 'cate') and the
        remaining rows as test data ('X_test', 'treatment_effect_test',
        'cate_test', 'Y_test'). 'P' is None because the propensity is not known.
    """
    url = "https://raw.githubusercontent.com/xnie/rlearner/master/experiments_for_paper/section2_example/data_clean.csv"
    data_clear = pd.read_csv(url)

    n = 100_000

    X = data_clear.iloc[:, 0:-2]
    # Plain arrays keep this dataset consistent with the simulated ones
    Y_obs = np.asarray(data_clear.loc[:, 'Y'])
    T = np.asarray(data_clear.loc[:, 'W'])

    TAU_all = np.asarray(-X.loc[:, 'vote00'] * 0.5 / (1 + 50 / X.loc[:, 'age'])).flatten()

    # Local generator: reproducible without touching the global numpy RNG state
    rng = np.random.RandomState(seed)
    FLIP = rng.binomial(1, np.abs(TAU_all))

    # No flip (or zero effect): both potential outcomes equal the observed one.
    # Otherwise (Y0, Y1) is (0, 1) for a positive effect and (1, 0) for a negative one.
    keep_observed = (FLIP == 0) | (TAU_all == 0)
    Y0 = np.where(keep_observed, Y_obs, np.where(TAU_all > 0, 0, 1))
    Y1 = np.where(keep_observed, Y_obs, np.where(TAU_all > 0, 1, 0))
    treatment_effect = (Y1 - Y0).flatten()
    Y = Y0 * (1-T) + Y1 * T
    X = np.array(X)

    # First n rows form the population; the remainder is held out for testing
    X, X_test = X[:n], X[n:]
    T = T[:n]
    Y0 = Y0[:n]
    Y1 = Y1[:n]
    Y, Y_test = Y[:n], Y[n:]
    treatment_effect, treatment_effect_test = treatment_effect[:n], treatment_effect[n:]
    cate, cate_test = TAU_all[:n], TAU_all[n:]

    data_voting = {'X':X, 
                   'P': None,
                   'T': T, 
                   'Y': Y,
                   'Y0': Y0, 
                   'Y1': Y1, 
                   'treatment_effect': treatment_effect, 
                   'cate': cate,
                   'X_test': X_test, 
                   'treatment_effect_test': treatment_effect_test,
                   'cate_test': cate_test,
                   'Y_test': Y_test
                   }
    
    return data_voting

def sample_from_data(data, n, seed=123):
    """Draw a treatment-stratified subsample of exactly ``n`` training rows.

    Parameters
    ----------
        data (dict): dataset dict with keys 'X', 'T', 'Y', 'Y0', 'Y1',
            'treatment_effect', 'cate' and the test keys copied below
        n (int): number of rows to sample; must be smaller than len(data['Y'])
        seed (int): random state of the split

    Returns
    -------
        dict with the subsampled training arrays, 'P' set to None, and the test
        entries of ``data`` passed through unchanged.
    """
    sampled_keys = ('X', 'T', 'Y', 'Y0', 'Y1', 'treatment_effect', 'cate')
    # An absolute train_size gives exactly n rows. The fractional form
    # test_size = 1 - n/N is rounded up after a float multiplication, so a tiny
    # rounding error (e.g. 1 - 0.7 evaluates to 0.30000000000000004) can leave
    # only n-1 rows in the sample.
    parts = train_test_split(*(data[key] for key in sampled_keys),
                             train_size=int(n),
                             stratify=data['T'],
                             random_state=seed)
    # train_test_split returns (kept, discarded) pairs in argument order
    kept = dict(zip(sampled_keys, parts[::2]))

    data_sample = {'X': kept['X'], 
                   'P': None,
                   'T': kept['T'], 
                   'Y': kept['Y'],
                   'Y0': kept['Y0'], 
                   'Y1': kept['Y1'], 
                   'treatment_effect': kept['treatment_effect'], 
                   'cate': kept['cate'],
                   'X_test': data['X_test'], 
                   'treatment_effect_test':  data['treatment_effect_test'],
                   'cate_test': data['cate_test'],
                   'Y_test':  data['Y_test']}
    return data_sample

In [None]:
# Build every population dataset once; the simulated ones use the default
# settings of data_simulate_prepare and are generated in the order A..E.
data_voting = data_voting_prepare()
data_A, data_B, data_C, data_D, data_E = (
    data_simulate_prepare(dataset=name) for name in ('A', 'B', 'C', 'D', 'E')
)

## 3. Experiment hyperparameters and helper functions

In [None]:
# Every experiment setting must be drawn from the "worlds" below.
# An experiment is indexed by
# 'dataset', 'data_size', 'd', 'sigma', 'learner', 'model', 'epsilon'.
datasets_world = {'Voting', 'A', 'B', 'C', 'D', 'E'}
datasizes_world = {500, 1_000, 2_000, 4_000, 8_000, 16_000, 32_000}
ds_world = {None, 6, 12}
sigmas_world = {None, 0.5, 1, 2}
epsilons_world = {None, 1, 2, 4, 8, 16, 128}

# Learners currently enabled. Disabled alternatives: 'TLearner', 'SLearnerExpand',
# 'XLearner1', 'DomainAdaptationLearner1', 'TLearnerEconML1', 'RLearnerEconML1',
# 'DRLearnerEconML1'.
learners_world = {'SLearner1', 'DRLearner', 'RLearner'}

# Base models, split by whether they take a privacy budget.
# Disabled non-private alternatives: 'auto_reg', 'LGBMRegressor'.
models_nonprivate_world = {'Lasso', 'ExplainableBoostingRegressor'}
# Disabled private alternatives: 'LinearNoisySGD', 'NNNoisySGD'.
models_private_world = {'DPExplainableBoostingRegressor'}
models_world = models_private_world | models_nonprivate_world

# Name -> class lookup for the enabled learners.
dict_learners = {
    'SLearner1': SLearner1,
    'DRLearner': DRLearner,
    'RLearner': RLearner,
}

# Name -> class lookup for the enabled base models.
dict_models = {
    'Lasso': Lasso,
    'ExplainableBoostingRegressor': ExplainableBoostingRegressor,
    'DPExplainableBoostingRegressor': DPExplainableBoostingRegressor,
}

In [None]:
# Helper Functions
def generate_exp_data(exp, 
                      seed, 
                      data_voting=data_voting,
                      data_A=data_A,
                      data_B=data_B,
                      data_C=data_C,
                      data_D=data_D,
                      data_E=data_E,
                      generate_new=False):
    """Return a subsample of the population dataset named by ``exp['dataset']``.

    Parameters
    ----------
        exp (dict): experiment description; reads 'dataset' and 'data_size'
        seed (int): random state of the subsampling
        data_voting, data_A, ..., data_E (dict): population datasets to sample from
        generate_new (bool): currently unused

    Returns
    -------
        The dict produced by sample_from_data with exp['data_size'] training rows.

    Raises
    ------
        ValueError: if exp['dataset'] is not a known dataset name.
    """
    populations = {
        'Voting': data_voting,
        'A': data_A,
        'B': data_B,
        'C': data_C,
        'D': data_D,
        'E': data_E,
    }
    name = exp['dataset']
    if name not in populations:
        # Previously an unknown name fell through to an UnboundLocalError
        raise ValueError(f"Unknown dataset {name!r}; expected one of {sorted(populations)}.")

    return sample_from_data(populations[name], exp['data_size'], seed)


def instantiate_algo(exp, seed, dict_learners=dict_learners, dict_models=dict_models,):
    """Build the (unfitted) learner described by an experiment.

    Parameters
    ----------
        exp (dict): experiment description; reads 'learner', 'model' and 'epsilon'
        seed (int): random state given to every model and to the learner where supported
        dict_learners (dict): learner name -> learner class
        dict_models (dict): model name -> regressor class

    Returns
    -------
        An instance of the requested learner wired with freshly constructed models.

    Raises
    ------
        ValueError: if the learner name has no construction rule below.
    """
    # Classifier used for the propensity model that pairs with each regressor
    reg_class_map = {
        'ExplainableBoostingRegressor' : ExplainableBoostingClassifier, 
        'DPExplainableBoostingRegressor' : DPExplainableBoostingClassifier, 
        'Lasso' : LogisticRegression,
#         'LGBMRegressor' : LGBMClassifier,
#         'auto_reg' : auto_clf
        }
    
    learner_name = exp['learner']
    model_name = exp['model']
    learner = dict_learners[learner_name]
    model = dict_models[model_name]
    
    # epsilon is only passed on when the experiment specifies one
    model_args = {'epsilon' : exp['epsilon']} if exp['epsilon'] is not None else {}
    model_args['random_state'] = seed
    # Copied before the regressor-only options below are added
    model_args_propensity = model_args.copy()
    
    if model_name == 'Lasso':
        model_args['alpha'] = 0.01
        model_args_propensity['C'] = 0.01
    if model_name in ['ExplainableBoostingRegressor', 'LGBMRegressor']:
        model_args['n_jobs'] = -2
        model_args_propensity['n_jobs'] = -2
    if model_name == 'ExplainableBoostingRegressor':
        model_args['interactions'] = 0
        model_args['outer_bags'] = 20
#         model_args_propensity['outer_bags'] = 20
#         model_args['outer_bags'] = 100
#         model_args_propensity['outer_bags'] = 100
    if learner_name in ['DRLearner', 'RLearner']:
        algo = learner(model_propensity=reg_class_map[model_name](**model_args_propensity), 
                       model_outcome=model(**model_args), 
                       model_cate=model(**model_args),
                       seed = seed)
    elif learner_name in ['SLearnerExpand', 'TLearner', 'TLearnerEconML1']:
        algo = learner(models=model(**model_args))
    elif learner_name == 'SLearner1':
        algo = learner(overall_model=model(**model_args))
    elif learner_name == 'XLearner1':
        algo = learner(models=model(**model_args),
                       propensity_model=reg_class_map[model_name](**model_args_propensity))
    elif learner_name == 'DomainAdaptationLearner1':
        algo = learner(models=model(**model_args), 
                       final_models=model(**model_args), 
                       propensity_model=reg_class_map[model_name](**model_args_propensity))
    elif learner_name == 'RLearnerEconML1':
        algo = learner(model_y=model(**model_args),
                       model_t=reg_class_map[model_name](**model_args_propensity), 
                       model_final=model(**model_args),
                       discrete_treatment=True,
                       random_state=seed)
    elif learner_name == 'DRLearnerEconML1':
        algo = learner(model_regression=model(**model_args),
                       model_final=model(**model_args),
                       model_propensity=reg_class_map[model_name](**model_args_propensity),
                       random_state=seed)
    else:
        # A learner registered in dict_learners but without a rule here used to
        # fail at the return statement with an UnboundLocalError
        raise ValueError(f"No construction rule for learner {learner_name!r}.")
    return algo


# Generate the full Cartesian product of experiment settings, with some exceptions
from itertools import product
def generate_experiments(DATASETS, DATA_SIZES, Ds, 
                         SIGMA, LEARNERS, MODELS, EPSILONS,
                         learners_world=learners_world,
                         models_nonprivate_world=models_nonprivate_world,
                         models_private_world=models_private_world
                        ):
    """Enumerate the valid experiments among all combinations of the given settings.

    Parameters
    ----------
        DATASETS, DATA_SIZES, Ds, SIGMA, LEARNERS, MODELS, EPSILONS (iterable):
            candidate values for each experiment index
        learners_world (set): accepted for symmetry with the other helpers; not used here
        models_nonprivate_world (set): model names that must come with epsilon None
        models_private_world (set): model names that require an epsilon

    Returns
    -------
        list of dicts with keys 'dataset', 'data_size', 'd', 'sigma', 'learner',
        'model', 'epsilon', sorted by model priority (see order_model) and then by
        dataset name in descending order. Empty list if nothing is valid.
    """
    index_names = ['dataset', 'data_size', 'd', 'sigma', 
                   'learner', 'model', 'epsilon']
    sample_splitting_learners = ['SLearner1', 'SLearnerExpand', 'DRLearner', 'RLearner']

    experiments = []
    for values in product(DATASETS, DATA_SIZES, Ds, SIGMA, LEARNERS, MODELS, EPSILONS):
        exp = dict(zip(index_names, values))
        is_voting = exp['dataset'] == 'Voting'

        # Ignore Voting with d or sigma
        if is_voting and ((exp['d'] is not None) or (exp['sigma'] is not None)):
            continue

        # Voting is recorded with d = 11
        if is_voting:
            exp['d'] = 11
        
        # Only Voting and A are run at data sizes above 8_000
        if (exp['dataset'] not in ['Voting', 'A']) and (exp['data_size'] > 8_000):
            continue

        # Ignore simulated datasets with None sigma or None d
        if (not is_voting) and ((exp['d'] is None) or (exp['sigma'] is None)):
            continue

        # Ignore non-private models with an epsilon
        if (exp['model'] in models_nonprivate_world) and (exp['epsilon'] is not None):
            continue

        # Ignore private models with no epsilon
        if (exp['model'] in models_private_world) and (exp['epsilon'] is None):
            continue
            
        # Ignore DP models with learners without sample splitting
        if (exp['model'] in models_private_world) and (exp['learner'] not in sample_splitting_learners):
            continue 
            
        experiments.append(exp)

    # An empty frame has no 'model' column, so the sort below would raise KeyError
    if not experiments:
        return []

    df_experiments = pd.DataFrame(experiments).replace({np.nan: None})
    df_experiments['priority'] = df_experiments['model'].apply(order_model)
    experiments = df_experiments.sort_values(by=['priority', 'dataset'], 
                                      ascending=[True, False]).drop(columns='priority').to_dict('records')
    return experiments

def order_model(learner):
    """Return the scheduling priority (1-5) of a model name.

    Lower numbers are sorted first by ``generate_experiments``. Names that are
    not in the table yield ``None``.
    """
    ranked_names = (
        ('Lasso', 1),
        ('DPExplainableBoostingRegressor', 2),
        ('ExplainableBoostingRegressor', 3),
        ('LGBMRegressor', 4),
        ('auto_reg', 5),
    )
    for model_name, rank in ranked_names:
        if learner == model_name:
            return rank
    return None

    
# check experiments setup is legit
def check_exp_input(DATASETS, DATA_SIZES, Ds, SIGMA, LEARNERS, MODELS, EPSILONS,
                    datasets_world=datasets_world,
                    datasizes_world=datasizes_world,
                    ds_world=ds_world,
                    sigmas_world=sigmas_world,
                    learners_world=learners_world,
                    models_world=models_world,
                    epsilons_world=epsilons_world):
    """Assert that every experiment grid only uses values from its allowed set.

    Each of the seven grids is checked, in argument order, against the matching
    ``*_world`` collection. The first grid containing a value outside its world
    triggers an ``AssertionError`` naming that grid. A confirmation line is
    printed when all checks pass.
    """
    checks = (
        ('DATASETS', DATASETS, 'datasets_world', datasets_world),
        ('DATA_SIZES', DATA_SIZES, 'datasizes_world', datasizes_world),
        ('Ds', Ds, 'ds_world', ds_world),
        ('SIGMA', SIGMA, 'sigmas_world', sigmas_world),
        ('LEARNERS', LEARNERS, 'learners_world', learners_world),
        ('MODELS', MODELS, 'models_world', models_world),
        ('EPSILONS', EPSILONS, 'epsilons_world', epsilons_world),
    )
    for grid_name, grid, world_name, world in checks:
        assert set(grid).issubset(world), f'{grid_name} should be a subset of {world_name}.'
    print('Experiment inputs are legit.')

def already_exist(exp, results_index_log):
    """Tell whether an experiment is already present in the results log.

    Parameters
    ----------
    exp : dict
        Experiment description. It must contain one entry per column of
        ``results_index_log``. As a side effect, a ``None`` value under 'd' is
        replaced by 11 in place.
    results_index_log : pandas.DataFrame
        The index columns of the previously saved results.

    Returns
    -------
    bool
        True when at least one logged row matches ``exp`` on every column.

    Raises
    ------
    KeyError
        If ``exp`` lacks a key for one of the log's columns.
    """
    if exp['d'] is None:
        exp['d'] = 11

    # Match column by column. Pooling all of exp's values into one set (as
    # DataFrame.isin(list) does) lets a value from one field satisfy another:
    # with sigma=1, a logged epsilon=1 row would also "match" epsilon=2.
    matches = pd.Series(True, index=results_index_log.index, dtype=bool)
    for column in results_index_log.columns:
        wanted = exp[column]
        logged = results_index_log[column]
        if pd.isna(wanted):
            # Missing values are stored as None/NaN and never compare equal
            matches &= logged.isna()
        else:
            matches &= (logged == wanted)
    return bool(matches.any())

## 4. Run the experiment and save the results

In [None]:
# Grids for this run. generate_experiments takes their Cartesian product and
# filters out the combinations that are not meaningful.
DATASETS = datasets_world
DATA_SIZES = datasizes_world
# None is only kept for 'Voting' (which is then assigned d = 11); 6 is used by
# the other datasets.
Ds = [None, 6]
# None is only kept for 'Voting'; the other datasets require a sigma.
SIGMA = [None, 1]
LEARNERS = ['SLearner1',
            'DRLearner',
            'RLearner',
            ]
MODELS =  ['DPExplainableBoostingRegressor', 
           'Lasso'
          ]
EPSILONS = epsilons_world
# NOTE(review): DELTA is not referenced in the cells of this section --
# presumably the DP delta read by instantiate_algo; confirm.
DELTA = 1e-5

In [None]:
%ls sample_data

dp_cate_simulation.csv        dp_cate_simulation_test.csv
dp_cate_simulation_final.csv


In [None]:
# %rm sample_data/dp_cate_simulation_final.csv

In [None]:
# Runs every experiment that is not already in the CSV log and collects, for
# each one, N_FULL_EXPERIMENTS independent replications. Each replication draws
# one dataset, splits the training part into N_algo_repeat disjoint chunks of
# n rows, fits one learner per chunk and aggregates the test-set metrics.

# path_log_csv = './sample_data/dp_cate_simulation_test.csv'
path_log_csv = './sample_data/dp_cate_simulation_final.csv'

START_SEED = 1
N_FULL_EXPERIMENTS = 5
N_ALGO_REPEAT = 5

check_exp_input(DATASETS, DATA_SIZES, Ds, SIGMA, LEARNERS, MODELS, EPSILONS)
experiments = generate_experiments(DATASETS, DATA_SIZES, Ds, SIGMA, LEARNERS, MODELS, EPSILONS)
results = pd.DataFrame(columns=['dataset', 'data_size', 'd', 'sigma', 'learner', 'model', 'epsilon', 
                                'MSE', 'MSE_ATE', 'MSEavg', 'MSEavg_ATE', 'num_effective_avg',
                                'iBias2', 'iVar','bias_ATE', 'iBias2_ATE', 'iVar_ATE'])
try: 
    results_log = pd.read_csv(path_log_csv)
    results_log.replace({np.nan: None}, inplace=True)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing/empty log means "start from scratch". Any other read error
    # (e.g. a corrupt file) should surface instead of letting the save cell
    # overwrite the existing log. Copy so the log does not alias `results`.
    results_log = results.copy()
results_index_log = results_log[['dataset', 'data_size', 'd', 'sigma', 
                                 'learner', 'model', 'epsilon']]

from tqdm.notebook import tqdm

for exp in tqdm(experiments):
    if already_exist(exp, results_index_log):
        print('skip already exist')
        continue
    print(exp)
    
    # One slot per replication; a slot stays None when it cannot be computed
    mse = [None] * N_FULL_EXPERIMENTS
    mseAvg = [None] * N_FULL_EXPERIMENTS
    mseATE = [None] * N_FULL_EXPERIMENTS
    mseATEavg = [None] * N_FULL_EXPERIMENTS
    num_effective_avg = [None] * N_FULL_EXPERIMENTS
    ivar = [None] * N_FULL_EXPERIMENTS
    ivarATE = [None] * N_FULL_EXPERIMENTS
    ibias2 = [None] * N_FULL_EXPERIMENTS
    ibias2ATE = [None] * N_FULL_EXPERIMENTS
    biasATE = [None] * N_FULL_EXPERIMENTS

    n = exp['data_size']
    # Fewer repeats for the largest Voting runs
    if (exp['dataset']=='Voting') and (exp['data_size'] == 32_000):
        N_algo_repeat = 3
    elif (exp['dataset']=='Voting') and (exp['data_size'] == 64_000):
        N_algo_repeat = 2
    else:
        N_algo_repeat = N_ALGO_REPEAT
    # Temporarily request enough rows for all repeats; restored after the loop
    exp['data_size'] = N_algo_repeat * n
    
    for i in range(N_FULL_EXPERIMENTS):
        data = generate_exp_data(exp, seed=START_SEED+100*i)
        ate = data['cate_test'].mean()
        mse_temp = 0
        mseATE_temp = 0
        m = 0  # number of repeats that completed successfully
        cate_pred_test = np.zeros(data['cate_test'].shape)
        ate_pred_test = 0
       
        for j in range(N_algo_repeat):
            algo = instantiate_algo(exp, seed=(START_SEED+100*i))
            rows = slice(j*n, (j+1)*n)
            try:
                _ = algo.fit(data['Y'][rows], T=np.array(data['T'])[rows], X=data['X'][rows])
                mse_j = algo.mse(data['X_test'], data['cate_test'])
                ate_temp = algo.ate(data['X_test'])
                mseATE_j = mean_squared_error(np.repeat(ate_temp, len(data['X_test'])), data['cate_test'])
                cate_pred_j = algo.cate_pred(data['X_test'])
            except Exception as err:
                # Best effort: a failed repeat is skipped for every model type
                print('Error: Fail to fit')
                print(f'  {type(err).__name__}: {err}')
                continue

            # Update the running means only once the whole repeat succeeded, so
            # a failure after fit() cannot leave m and the means out of sync.
            m += 1
            mse_temp = mse_temp * (m-1)/m + mse_j/m
            # Running mean like mse_temp: the variance formula below needs the
            # average per-repeat MSE, not just the last repeat's value.
            mseATE_temp = mseATE_temp * (m-1)/m + mseATE_j/m
            cate_pred_test = cate_pred_test * (m-1)/m + cate_pred_j/m
            ate_pred_test = ate_pred_test * (m-1)/m + ate_temp/m

        if m >= 1:
            mse[i] = mse_temp
            mseAvg[i] = mean_squared_error(cate_pred_test, data['cate_test'])
            mseATE[i] = mseATE_temp
            mseATEavg[i] = mean_squared_error(np.repeat(ate_pred_test, len(data['X_test'])), data['cate_test'])
            num_effective_avg[i] = m
            biasATE[i] = ate_pred_test - ate
        if m >= 2:
            # Variance = (mean per-repeat MSE - MSE of the averaged prediction)
            # rescaled by m/(m-1), clipped to [0, MSE]; bias^2 is the remainder.
            ivar[i] = min(max((mse[i] - mseAvg[i]) * m / (m-1), 0), mse[i])
            ivarATE[i] = min(max((mseATE[i] - mseATEavg[i]) * m / (m-1), 0), mseATE[i])
            ibias2[i] = mse[i] - ivar[i]
            ibias2ATE[i] = mseATE[i] - ivarATE[i]
            
    exp['data_size'] = n
    exp['MSE'] = mse
    exp['MSE_ATE'] = mseATE
    exp['MSEavg'] = mseAvg
    exp['MSEavg_ATE'] = mseATEavg
    exp['num_effective_avg'] = num_effective_avg
    exp['iBias2'] = ibias2
    exp['iVar'] = ivar
    exp['bias_ATE'] = biasATE
    exp['iBias2_ATE'] = ibias2ATE
    exp['iVar_ATE'] = ivarATE

    results.loc[len(results)] = exp

Experiment inputs are legit.


  0%|          | 0/714 [00:00<?, ?it/s]

skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already

skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already exist
skip already

In [None]:
results

Unnamed: 0,dataset,data_size,d,sigma,learner,model,epsilon,MSE,MSE_ATE,MSEavg,MSEavg_ATE,num_effective_avg,iBias2,iVar,bias_ATE,iBias2_ATE,iVar_ATE


### Save the results

In [None]:
%ls sample_data

dp_cate_simulation.csv        dp_cate_simulation_test.csv
dp_cate_simulation_final.csv


In [None]:
# Append this session's rows to the previously logged ones, keep the first
# occurrence of every experiment index, and write the merged table back.
results_combined = (
    pd.concat([results_log, results], ignore_index=True)
    .drop_duplicates(
        subset=['dataset', 'data_size', 'd', 'sigma', 'learner', 'model', 'epsilon'],
        keep='first',
        ignore_index=True,
    )
)
results_combined.to_csv(path_log_csv, index=False)

In [None]:
%ls sample_data

dp_cate_simulation.csv        dp_cate_simulation_test.csv
dp_cate_simulation_final.csv


## 5. Figure 1: EBM Shape Function Plots

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Fit one DP-EBM DR-learner per privacy budget on a single Voting draw and keep
# the fitted learners for the shape-function plot.
exp = dict(
    dataset='Voting',
    data_size=16_000,
    d=11,
    sigma=None,
    learner='DRLearner',
    model='DPExplainableBoostingRegressor',
)
epsilons = [1, 2, 4, 8, 16]

results_plot = pd.DataFrame(columns=[*exp, 'epsilon', 'algo_fitted'])

data = generate_exp_data(exp, seed=2021)

for epsilon in tqdm(epsilons):
    # Same base configuration, only the privacy budget changes
    exp_temp = {**exp, 'epsilon': epsilon}
    algo = instantiate_algo(exp_temp, seed=2021)
    algo.fit(data['Y'], T=np.array(data['T']), X=data['X'])
    exp_temp['algo_fitted'] = algo
    results_plot.loc[len(results_plot)] = exp_temp

  0%|          | 0/5 [00:00<?, ?it/s]













In [None]:
# Largest epsilon first; ignore_index renumbers the rows 0..k-1, which the
# plotting cell relies on when it maps the row index to a subplot column.
results_plot = results_plot.sort_values('epsilon', ascending=False, ignore_index=True)

In [None]:
# Draw, side by side, the shape function of one feature of the fitted CATE
# model for every privacy budget in results_plot, then save the figure.
import os

feature_index = -3
# np.unique already returns the distinct values in sorted order
feature_values = np.unique(data['X'][:, feature_index])
x_min, x_max = feature_values.min(), 81.5
y_min, y_max = -0.15, 0.15

fig_separate = make_subplots(
    rows=1, cols=len(results_plot['epsilon']),
    subplot_titles=[f'𝜀 = {epsilon}' for epsilon in results_plot['epsilon']])

for index, row in results_plot.iterrows():
    model = row['algo_fitted'].model_cate
    name = f"𝜀 = {model.epsilon}" if model.epsilon else 'EBM'
    
    ebm_exp = model.explain_global()
    fig = ebm_exp.visualize(feature_index)
    
    # Trace 1 of the EBM figure is the one re-plotted here
    x, y = fig.data[1]['x'], fig.data[1]['y']
    fig_separate.add_trace(go.Scatter(x=x, y=y,
                             mode='lines',
                             line={'shape': 'hv', 'width': 2},
                             name=name), row=1, col=index+1)
    fig_separate.update_yaxes(range=[y_min, y_max*1.1], row=1, col=index+1)
    fig_separate.update_xaxes(range=[x_min, x_max], title="Age", row=1, col=index+1)

# Figure-wide layout does not depend on the subplot, so set it once
fig_separate.update_layout(
    xaxis_title="Age",
    yaxis_title="CATE on Voter Turnout",
    font=dict(family="Courier New, monospace",
              size=12,
              color="RebeccaPurple"),
    margin=dict(t=20, b=10, l=10, r=10),
    showlegend=False,
    autosize=False,
    width=1000,
    height=300,
)

fig_separate.show()
# write_image does not create missing directories
os.makedirs('plots', exist_ok=True)
fig_separate.write_image('plots/f_varying_epsilon.png', height=300, width=1000)

## 6. Exploratory plots

In [None]:
def CATE_histogram(treatment_effect):
    """Return a plotly histogram figure of the given treatment effects."""
    text_style = dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple",
    )
    figure = px.histogram(x=treatment_effect)
    figure.update_layout(
        title="CATE histogram",
        xaxis_title="CATE",
        yaxis_title="Counts",
        font=text_style,
    )
    return figure

def scatterplot_CATE_predictedCATE(tau, predicted_tau):
    """Return a scatter figure of predicted against true CATE.

    A diagonal 'oracle' line spanning the joint range of both inputs is added
    as the reference for perfect predictions.
    """
    lowest = min(min(tau), min(predicted_tau))
    highest = max(max(tau), max(predicted_tau))

    points = go.Scatter(x=tau, y=predicted_tau,
                        mode='markers',
                        marker_size=3,
                        name='predicted')
    diagonal = go.Scatter(x=[lowest, highest], y=[lowest, highest],
                          mode='lines',
                          name='oracle')

    figure = go.Figure()
    figure.add_trace(points)
    figure.add_trace(diagonal)
    figure.update_layout(
        title="CATE - predictedCATE Scatter Plot",
        xaxis_title="CATE",
        yaxis_title="predicted CATE",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple",
        ),
    )
    return figure

In [None]:
# Uses whichever `data` dict was generated last in the notebook session
CATE_histogram(data['cate'])

In [None]:
# First 1000 test points only. `algo` is whichever learner was fitted last in
# the session (the final iteration of the Figure 1 loop when run top to bottom).
scatterplot_CATE_predictedCATE(data['cate_test'][:1000], 
                               algo.cate_pred(data['X_test'][:1000]))

## 7. NoisySGD based linear model and NN [to be finished]

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

import pandas as pd
import numpy as np


class NN_Logistic_Regression(torch.nn.Module):
    """Logistic regression as a single linear layer followed by a sigmoid."""

    def __init__(self, input_dim):
        """Create the layer for ``input_dim`` input features and one output."""
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)
        # Preferred device; the module itself is not moved here
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
    def forward(self, x):
        """Return the sigmoid of the linear layer's output."""
        return torch.sigmoid(self.linear(x))
    
    def fit(self, X_train, y_train, optimizer, n_epochs, batch_size, checkpoint=10, loss_fn=torch.nn.BCELoss()):
        """Train with mini-batch gradient steps and return ``self``.

        Parameters
        ----------
        X_train, y_train : numpy.ndarray or torch.Tensor
            Arrays are converted to float32 tensors; ``y_train`` arrays are
            reshaped to a column. Tensors are used as given.
        optimizer : torch.optim.Optimizer
            Optimizer already bound to this model's parameters.
        n_epochs : int
            Number of passes over the data.
        batch_size : int
            Rows per shuffled mini-batch.
        checkpoint : int
            The last batch loss is printed every ``checkpoint`` epochs.
        loss_fn : callable
            Called as ``loss_fn(prediction, target)``.
        """
        if isinstance(X_train, np.ndarray):
            X_train = torch.from_numpy(X_train.astype(np.float32))
        if isinstance(y_train, np.ndarray):
            y_train = torch.from_numpy(y_train.reshape(-1, 1).astype(np.float32))
        
        dataset = TensorDataset(X_train, y_train)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Send batches to where the weights actually are; __init__ never moves
        # the module, so self.device may differ from the parameters' device.
        param_device = self.linear.weight.device
        
        for epoch in range(n_epochs):
            for X_batch, y_batch in dataloader:
                # Train on the mini-batch, not on the full training set
                X_batch = X_batch.to(param_device)
                y_batch = y_batch.to(param_device)

                # Compute prediction error
                pred = self(X_batch)
                loss = loss_fn(pred, y_batch)

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if epoch % checkpoint == 0:
                print(f"Iteration: {epoch} has loss: {loss:.5f}")

        return self
        
    
class NN_Linear_Regression(torch.nn.Module):
    """Linear regression as a single linear layer."""

    def __init__(self, input_dim):
        """Create the layer for ``input_dim`` input features and one output."""
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)
        # Preferred device; the module itself is not moved here
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def forward(self, x):
        """Return the linear layer's output."""
        return self.linear(x)
    
    def fit(self, X_train, y_train, optimizer, n_epochs, batch_size, checkpoint=10, loss_fn=torch.nn.MSELoss()):
        """Train with mini-batch gradient steps and return ``self``.

        Parameters
        ----------
        X_train, y_train : numpy.ndarray or torch.Tensor
            Arrays are converted to float32 tensors; ``y_train`` arrays are
            reshaped to a column. Tensors are used as given.
        optimizer : torch.optim.Optimizer
            Optimizer already bound to this model's parameters.
        n_epochs : int
            Number of passes over the data.
        batch_size : int
            Rows per shuffled mini-batch.
        checkpoint : int
            The last batch loss is printed every ``checkpoint`` epochs.
        loss_fn : callable
            Called as ``loss_fn(prediction, target)``.
        """
        if isinstance(X_train, np.ndarray):
            X_train = torch.from_numpy(X_train.astype(np.float32))
        if isinstance(y_train, np.ndarray):
            y_train = torch.from_numpy(y_train.reshape(-1, 1).astype(np.float32))
        
        dataset = TensorDataset(X_train, y_train)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Send batches to where the weights actually are; __init__ never moves
        # the module, so self.device may differ from the parameters' device.
        param_device = self.linear.weight.device
        
        for epoch in range(n_epochs):
            for X_batch, y_batch in dataloader:
                # Train on the mini-batch, not on the full training set
                X_batch = X_batch.to(param_device)
                y_batch = y_batch.to(param_device)

                # Compute prediction error
                pred = self(X_batch)
                loss = loss_fn(pred, y_batch)

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if epoch % checkpoint == 0:
                print(f"Iteration: {epoch} has loss: {loss:.5f}")

        return self

In [None]:
# Example Usage for Linear Regression -- similar for Logistic 
# Fits scikit-learn's LinearRegression and NN_Linear_Regression on California
# Housing and prints the test MSE of both.

from sklearn.datasets import load_breast_cancer, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error

cal = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(cal.data, cal.target, test_size=0.2)


scaler = StandardScaler()

# Learn the scaling on the training split only and reuse it for the test
# split; re-fitting on the test data would scale the two splits differently.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print('Loaded California Housing Regression Dataset!')
print('-'*80)

lr = LinearRegression()
lr.fit(X_train, y_train)

print(f"Linear Regression Test MSE: {mean_squared_error(y_test, lr.predict(X_test)):.4f}")

print('-'*80)

BATCH_SIZE = 500
EPOCHS = 50
EPSILON = 100
DELTA = 1e-5
MAX_GRAD_NORM = 1.2

SAMPLE_RATE = BATCH_SIZE / len(X_train)

model = NN_Linear_Regression(X_train.shape[1])
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# DP Section
# from opacus import PrivacyEngine

# privacy_engine = PrivacyEngine(
#     model,
#     sample_rate = SAMPLE_RATE,
#     epochs = EPOCHS,
#     target_epsilon = EPSILON,
#     target_delta = DELTA,
#     max_grad_norm= MAX_GRAD_NORM,
# )
# privacy_engine.attach(optimizer)
model.fit(X_train, y_train, optimizer, EPOCHS, BATCH_SIZE, checkpoint=25)


with torch.no_grad():
    y_pred = model(torch.from_numpy(X_test.astype(np.float32)))
    # Regression output: score the raw predictions. Rounding them to integers
    # (a classification step) would distort the reported MSE.
    mse = mean_squared_error(y_test, y_pred.cpu().numpy().ravel())
    print(f"NN LR Test MSE: {mse:.4f}")