In [9]:
from collections import defaultdict
from causal_model import model
from causal_model import graph

We first test the basic functions of several classes.

In [10]:
# input causation structures
g = defaultdict(list)
g['T'] = ['Y', 'X3']
g['X1'] = ['T', 'X2']
g['X2'] = ['Y']

In [11]:
G = graph.CausalGraph(g)

In [12]:
G.is_dag()

True

In [13]:
G.add_edges([
    ('Y', 'X3')]
            )

In [14]:
G.edges

[('Y', 'T'), ('X3', 'T'), ('T', 'X1'), ('X2', 'X1'), ('Y', 'X2'), ('Y', 'X3')]

In [15]:
G.to_adj_matrix()

matrix([[0., 1., 1., 0., 1.],
        [0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.]])

In [16]:
cmodel = model.CausalModel(G, estimation=('LR', 'COM'))

KeyError: 'estimator_model'

In [None]:
set = cmodel.identify('T', 'Y')

The corresponding statistical estimand should be P(Y|T, X3)


In [None]:
set

['X3']

We now try to run a complete example.

In [None]:
import pandas as pd
import numpy as np

In [None]:
np.random.seed(2333)

In [None]:
n_users = 10000
def gen_data(n_users, binary_treatment=True, with_income=False):
    
    if with_income:
        income = np.random.normal(500, scale=15, size=n_users)
        gender = np.random.randint(0, 2, size=n_users)
        coupon = gender * 20 + 110 + income / 50 + np.random.normal(scale=5, size=n_users)
        if binary_treatment:
            coupon = (coupon > 120).astype(int)
        amount = coupon * 150 + gender * 100 + 150 + income / 5 + np.random.normal(size=n_users)
        time_spent = coupon * 10 + amount / 10

        df = pd.DataFrame({
            'gender': gender,
            'coupon': coupon,
            'amount': amount,
            'income': income,
            'time_spent': time_spent,
        })

        return df
    else:
        gender = np.random.randint(0, 2, size=n_users)
        coupon = gender * 20 + 150 + np.random.normal(scale=5, size=n_users)
        if binary_treatment:
            coupon = (coupon > 150).astype(int)
        amount = coupon * 30 + gender * 100 + 150 + np.random.normal(size=n_users)
        time_spent = coupon * 100 + amount / 10

        df = pd.DataFrame({
            'gender': gender,
            'coupon': coupon,
            'amount': amount,
            'time_spent': time_spent,
        })
        
        return df

df = gen_data(n_users, with_income=True)
df['treatment'] = df['coupon'] > 0
df

Unnamed: 0,gender,coupon,amount,income,time_spent,treatment
0,1,1,503.431387,514.476454,60.343139,True
1,1,1,499.607593,501.092746,59.960759,True
2,1,1,498.477904,501.318961,59.847790,True
3,0,0,250.559512,499.204035,25.055951,False
4,0,1,400.021093,493.324142,50.002109,True
...,...,...,...,...,...,...
9995,1,1,500.072047,497.626099,60.007205,True
9996,0,0,249.039924,496.773953,24.903992,False
9997,0,1,398.972469,506.625278,49.897247,True
9998,1,1,499.466652,494.572779,59.946665,True


In [None]:
# naive estimation
te = (df['coupon'] > 0)
ts = (df['coupon'] == 0)
np.mean(df.loc[te, 'amount']) - np.mean(df.loc[ts, 'amount'])

216.63817932298377

In [None]:
df.coupon.value_counts()

1    7494
0    2506
Name: coupon, dtype: int64

In [None]:
from collections import defaultdict
gx = defaultdict(list)
gx['coupon'] = ['gender', 'income']
gx['gender'] = []
gx['amount'] = ['coupon', 'income']
gx['income'] = []
gx['time_spent'] = ['coupon', 'amount']

In [None]:
Gx = graph.CausalGraph(gx)

In [None]:
Gx.is_dag()

True

In [None]:
Gx.edges

[('gender', 'coupon'),
 ('income', 'coupon'),
 ('coupon', 'amount'),
 ('income', 'amount'),
 ('coupon', 'time_spent'),
 ('amount', 'time_spent')]

In [None]:
Gx.to_adj_matrix()

matrix([[0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 1.],
        [0., 1., 0., 1., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0.]])

In [None]:
Cmodel = model.CausalModel(Gx, estimation=('LR', 'COM'))

In [None]:
adjustment_set = Cmodel.identify('coupon', 'amount')

The corresponding statistical estimand should be P(amount|coupon, gender, income)


In [None]:
adjustment_set

['gender', 'income']

In [None]:
X = df.drop(['amount', 'treatment'], axis=1)
X

Unnamed: 0,gender,coupon,income,time_spent
0,1,1,514.476454,60.343139
1,1,1,501.092746,59.960759
2,1,1,501.318961,59.847790
3,0,0,499.204035,25.055951
4,0,1,493.324142,50.002109
...,...,...,...,...
9995,1,1,497.626099,60.007205
9996,0,0,496.773953,24.903992
9997,0,1,506.625278,49.897247
9998,1,1,494.572779,59.946665


In [None]:
y = df['amount']

In [None]:
Cmodel.estimate(X, y,'coupon', adjustment_set)

149.98329515879777