# DML Diff-in-Diff


In [61]:
import numpy as np
import pandas as pd
import doubleml as dml
import multiprocessing
import copy

from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

## Repeated Outcomes

We will focus on the the DGP1 from [Sant'Anna and Zhao (2020)](https://arxiv.org/pdf/1812.01723.pdf), see Section 4.1 and 4.2.

In [78]:
def f_reg(Z):
  res = 210 + 27.4*Z[:, 0] + 13.7*(Z[:, 1] + Z[:, 2] + Z[:, 3])
  return res

def f_ps(Z):
  res = 0.75*(-Z[:, 0] + 0.5*Z[:, 1] - 0.25*Z[:, 2] - 0.1*Z[:, 3])
  return res

def nu(Z, D):
  res = np.random.normal(loc=D*f_reg(Z), scale=1, size=n)


def dgp(n=200):
  # features
  X = np.random.normal(loc=0, scale=1, size=[n,4])
  Z_tilde_1 = np.exp(0.5*X[:, 0])
  Z_tilde_2 = 10 + X[:, 1] / (1 + np.exp(X[:, 0]))
  Z_tilde_3 = (0.6 + X[:, 0]*X[:, 2]/25)**3
  Z_tilde_4 = (20 + X[:, 1] + X[:, 3])**2

  Z_tilde = np.column_stack((Z_tilde_1, Z_tilde_2, Z_tilde_3, Z_tilde_4))
  Z = (Z_tilde - np.mean(Z_tilde, axis=0)) / np.std(Z_tilde, axis=0)

  # error terms
  epsilon_0 = np.random.normal(loc=0, scale=1, size=n)
  epsilon_1 = np.random.normal(loc=0, scale=1, size=[n,2])

  # treatment and propensities
  p = np.exp(f_ps(Z)) / (1 + np.exp(f_ps(Z)))
  U = np.random.uniform(low=0, high=1, size=n)
  D = 1.0 * (p >= U)
  nu =  np.random.normal(loc=D*f_reg(Z), scale=1, size=n)

  # potential outcomes
  Y0 = f_reg(Z) + nu + epsilon_0
  Y1_d0 = 2*f_reg(Z) + nu + epsilon_1[:, 0]
  Y1_d1 = 2*f_reg(Z) + nu + epsilon_1[:, 1]
  Y1 = D*Y1_d1 + (1-D)*Y1_d0

  return Y0, Y1, D, Z, Y1_d0, Y1_d1


The ATT should be zero:

In [85]:
_, _, D, _, Y1_d0, Y1_d1 = dgp(int(10e+6))

ATT = np.mean(Y1_d1[D == 1] - Y1_d0[D == 1])
print(f'Observed ATT: {ATT}')

Observed ATT: 0.0001247502481387498


In [90]:
n = 1000
np.random.seed(42)
Y0, Y1, D, X, _, _ = dgp(n=n)
Y_diff = Y1 - Y0
obj_dml_data = dml.DoubleMLData.from_arrays(X, Y_diff, D)

In [91]:
ml_g = RandomForestRegressor(n_estimators=100)
ml_m = RandomForestClassifier(n_estimators=100)

dml_DiD = dml.DoubleMLDID(obj_dml_data, ml_g, ml_m,n_folds=5)

dml_DiD.fit()

print(dml_DiD.coef)
dml_DiD.confint(level=0.95)

[-0.15108756]




Unnamed: 0,2.5 %,97.5 %
d,-3.308783,3.006608


# Repeated Cross-sectional Data

Use the same data with coss-sectional observations

In [96]:
lambda_T = 0.5
U_T = np.random.uniform(low=0, high=1, size=n)
T = 1.0 * (U_T <= lambda_T)
Y_obs = T * Y1 + (1-T)*Y0
obj_dml_rcs_data = dml.DoubleMLDIDData.from_arrays(X, Y_obs, D, T)

In [97]:
ml_g = RandomForestRegressor(n_estimators=100)
ml_m = RandomForestClassifier(n_estimators=100)

dml_DiD_RCS = dml.DoubleMLDID(obj_dml_rcs_data, ml_g, ml_m,n_folds=5, score='RCS')

dml_DiD_RCS.fit()

print(dml_DiD_RCS.coef)
dml_DiD_RCS.confint(level=0.95)

[28.47806774]


Unnamed: 0,2.5 %,97.5 %
d,-1.381118,58.337253
