In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
# Number of samples drawn from each simulated data-generating process.
NUM_DATA_POINTS = 100000

# Seed NumPy's global (legacy) generator, which the np.random.* calls in the
# data-generating functions draw from, so that Restart Kernel -> Run All
# reproduces the same datasets and estimates. Re-running a single cell on its
# own continues the random stream and therefore yields different draws.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Introduction

![figures/identification_estimation_flowchart.png](figures/identification_estimation_flowchart.png)

**Causal estimand:** the average treatment effect (ATE), $\mathbb{E}[Y(1) - Y(0)] = \mathbb{E}[Y \mid do(T=1)] - \mathbb{E}[Y \mid do(T=0)]$.

**Estimation approach:** conditional outcome modeling (COM).

# Causal Model

## Graphical Model (DAG)

![figures/dag_1.png](figures/dag_1.png)

## Structural Causal Model (SCM)

## Data generating process

In [2]:
def f_x():
    """Structural equation for the covariate X.

    Returns:
        A single draw from ``np.random.choice([0, 1])``, i.e. 0 or 1.
    """
    return np.random.choice([0, 1])

def f_t(x):
    """Structural equation for the binary treatment T given X.

    A latent score ``0.5 * x + noise`` is thresholded at zero, where the noise
    is one draw from ``np.random.normal()``.

    Args:
        x: Value of the covariate X.

    Returns:
        1 if the latent score is strictly positive, otherwise 0.
    """
    noise = np.random.normal()
    latent_score = 0.5 * x + noise
    if latent_score > 0:
        return 1
    return 0

def f_y(x, t):
    """Structural equation for the outcome Y given X and T.

    Args:
        x: Value of the covariate X.
        t: Value of the treatment T.

    Returns:
        ``0.8 * x + 1.2 * t`` plus one draw from ``np.random.normal()``.
    """
    noise = np.random.normal()
    systematic_part = 0.8 * x + 1.2 * t
    return systematic_part + noise

# Sample the covariate first, then push it through the structural equations
# in order: X feeds f_t, and (X, T) feed f_y.
X = np.array([f_x() for _sample in range(NUM_DATA_POINTS)])
T = np.array(list(map(f_t, X)))
Y = np.array(list(map(f_y, X, T)))

# Identification

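**Backdoor criterion:** in the data-generating process above, the only backdoor path from $T$ to $Y$ is $T \leftarrow X \rightarrow Y$, so $\{X\}$ is a sufficient adjustment set.

**Backdoor adjustment:** $\mathbb{E}[Y \mid do(T=t)] = \mathbb{E}_X\big[\mathbb{E}[Y \mid T=t, X]\big]$.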

# Estimation

Linear model

In [3]:
# Design matrix with columns (X, T), plus two counterfactual copies in which
# every row's treatment column is set to 1 and to 0 respectively.
XT = np.vstack([X, T]).T
XT_1 = np.vstack([X, np.ones_like(X, dtype=float)]).T
XT_0 = np.vstack([X, np.zeros_like(X, dtype=float)]).T

# Fit the outcome model on the observed data, then average the predicted
# treated-minus-control difference over all rows.
model = LinearRegression().fit(XT, Y)
ate_estimate = (model.predict(XT_1) - model.predict(XT_0)).mean()
print("ATE estimate:", ate_estimate)

ATE estimate: 1.1954780391658062


In [4]:
# The model is linear in its inputs, so the predicted treated-minus-control
# difference is the same for every row: the fitted coefficient on T, which is
# column 1 of XT.
treatment_coef = model.coef_[1]
print("ATE estimate:", treatment_coef)

ATE estimate: 1.195478039165806


# Colliders

![figures/dag_2.png](figures/dag_2.png)

In [5]:
def f_x():
    """Draw the covariate X: one sample from ``np.random.choice`` over [0, 1]."""
    support = [0, 1]
    return np.random.choice(support)

def f_t(x):
    """Draw the binary treatment T for a given covariate value.

    Args:
        x: Value of the covariate X.

    Returns:
        1 when ``0.5 * x`` plus one ``np.random.normal()`` draw is strictly
        positive, else 0.
    """
    noise = np.random.normal()
    latent = 0.5 * x + noise
    return int(bool(latent > 0))

def f_y(x, t):
    """Draw the outcome Y for given covariate and treatment values.

    Args:
        x: Value of the covariate X.
        t: Value of the treatment T.

    Returns:
        ``0.8 * x + 1.2 * t`` plus one draw from ``np.random.normal()``.
    """
    noise = np.random.normal()
    outcome = 0.8 * x + 1.2 * t
    outcome = outcome + noise
    return outcome

def f_z(t, y):
    """Structural equation for the binary variable Z, a function of T and Y.

    Args:
        t: Value of the treatment T.
        y: Value of the outcome Y.

    Returns:
        1 when ``1.5 * t + y - 2`` plus one ``np.random.normal()`` draw is
        strictly positive, else 0.
    """
    noise = np.random.normal()
    score = 1.5 * t + y - 2 + noise
    if score > 0:
        return 1
    return 0

# Simulate in structural order: X first, then T from X, Y from (X, T), and
# finally Z from (T, Y).
X = np.array([f_x() for _sample in range(NUM_DATA_POINTS)])
T = np.array(list(map(f_t, X)))
Y = np.array(list(map(f_y, X, T)))
Z = np.array(list(map(f_z, T, Y)))

In [6]:
# Regress Y on (X, Z, T). Z is generated from T and Y (see f_z), so including
# it as a regressor conditions on a common effect of treatment and outcome.
XZT = np.vstack([X, Z, T]).T
XZT_1 = np.vstack([X, Z, np.ones_like(X, dtype=float)]).T
XZT_0 = np.vstack([X, Z, np.zeros_like(X, dtype=float)]).T

# Average predicted difference between the all-treated and all-control copies.
model = LinearRegression().fit(XZT, Y)
ate_estimate = (model.predict(XZT_1) - model.predict(XZT_0)).mean()
print("ATE estimate:", ate_estimate)

ATE estimate: 0.4157800161715359


In [7]:
# Same regression with Z left out: the design matrix has columns (X, T) only.
XT = np.vstack([X, T]).T
XT_1 = np.vstack([X, np.ones_like(X, dtype=float)]).T
XT_0 = np.vstack([X, np.zeros_like(X, dtype=float)]).T

# Average predicted difference between the all-treated and all-control copies.
model = LinearRegression().fit(XT, Y)
ate_estimate = (model.predict(XT_1) - model.predict(XT_0)).mean()
print("ATE estimate:", ate_estimate)

ATE estimate: 1.1995390694286208


# Unobserved confounding

![figures/dag_3.png](figures/dag_3.png)

In [8]:
def f_x():
    """Sample the observed covariate X.

    Returns:
        One draw from ``np.random.choice([0, 1])``.
    """
    sampled = np.random.choice([0, 1])
    return sampled

def f_u():
    """Sample the variable U.

    Returns:
        One draw from ``np.random.normal()``.
    """
    return np.random.normal()

def f_t(x, u):
    """Structural equation for the binary treatment T given X and U.

    Args:
        x: Value of the covariate X.
        u: Value of the variable U.

    Returns:
        1 when ``0.5 * x + 0.8 * u`` plus one ``np.random.normal()`` draw is
        strictly positive, else 0.
    """
    noise = np.random.normal()
    latent_score = 0.5 * x + 0.8 * u + noise
    if latent_score > 0:
        return 1
    return 0

def f_y(x, u, t):
    """Structural equation for the outcome Y given X, U and T.

    Args:
        x: Value of the covariate X.
        u: Value of the variable U.
        t: Value of the treatment T.

    Returns:
        ``0.8 * x - 2 * u + 1.2 * t`` plus one draw from ``np.random.normal()``.
    """
    noise = np.random.normal()
    systematic_part = 0.8 * x - 2 * u + 1.2 * t
    return systematic_part + noise

# X and U are sampled independently of everything else; T is then generated
# from (X, U) and Y from (X, U, T).
X = np.array([f_x() for _sample in range(NUM_DATA_POINTS)])
U = np.array([f_u() for _sample in range(NUM_DATA_POINTS)])
T = np.array(list(map(f_t, X, U)))
Y = np.array(list(map(f_y, X, U, T)))

In [9]:
# Regress Y on (X, T) only. U enters both f_t and f_y in the simulation but is
# deliberately left out of the design matrix here.
XT = np.vstack([X, T]).T
XT_1 = np.vstack([X, np.ones_like(X, dtype=float)]).T
XT_0 = np.vstack([X, np.zeros_like(X, dtype=float)]).T

# Average predicted difference between the all-treated and all-control copies.
model = LinearRegression().fit(XT, Y)
ate_estimate = (model.predict(XT_1) - model.predict(XT_0)).mean()
print("ATE estimate:", ate_estimate)

ATE estimate: -0.793921635724498


## Sensitivity analysis

# Conclusion

TODO

This notebook covers just a small part of causal inference.