In [127]:
# install your library

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import graphviz as gr
import random

In [128]:
path = "C:/Users/zjy97/Downloads/python-causality-handbook-v1.0/matheusfacure-python-causality-handbook-f666303/causal-inference-for-the-brave-and-true/data/"

In [145]:
# 1. what is matching estimator

# some confounders make the treated and untreated are not initially comparable, they have bias, so we can make them by matching
# each treated unit with a similar untreated unit

# case:
trainee = pd.read_csv(path+'trainees.csv')

# if we assume there is no bias, the mean difference between y0 and y1 and ATE
trainee.query('trainees==1')['earnings'].mean() - trainee.query('trainees==0')['earnings'].mean()# -4297.493734

# see how matched datasets looks like for the first 7 units
unique_on_age = (trainee
                .query('trainees==0')
                .drop_duplicates('age'))
matches = (trainee
          .query('trainees==1')
          .merge(unique_on_age, on='age', how='left', suffixes=('_t_1', '_t_0'))
          .assign(t1_minuts_t0 = lambda d: d["earnings_t_1"] - d["earnings_t_0"]))

# now we can get ATET estimate while controlling for age
matches['t1_minuts_t0'].mean()


# 2. how to run a matching

2457.8947368421054

In [146]:
med = pd.read_csv(path+'medicine_impact_recovery.csv')

# let's try to compute the mean difference between the treatment
med.query('medication==1')['recovery'].mean() - med.query('medication==0')['recovery'].mean()# 16.89

16.895799546498726

In [148]:
# the first step is to respectively run two models for treatment

from sklearn.neighbors import KNeighborsRegressor

treated = med.query('medication==1')
untreated = med.query('medication==0')

X = ['severity','age','sex']
y = 'recovery'

mt0 = KNeighborsRegressor(n_neighbors=1).fit(untreated[X], untreated[y])

mt1 = KNeighborsRegressor(n_neighbors=1).fit(treated[X], treated[y])

predicted = pd.concat([
    # find matches for the treated looking at the untreated knn model
    treated.assign(match=mt0.predict(treated[X])),
    
    # find matches for the untreated looking at the treated knn model
    untreated.assign(match=mt1.predict(untreated[X]))
])

predicted.head()

Unnamed: 0,sex,age,severity,medication,recovery,match
0,0,35.049134,0.887658,1,31,38.0
1,1,41.580323,0.899784,1,49,49.0
7,0,48.616165,0.832912,1,38,45.0
10,1,30.721868,0.626067,1,34,42.0
16,0,32.39288,0.878765,1,30,36.0


In [152]:
# the second step is to use matching estimator formula to compute ATE
# the mean of [(2T-1)*(predicted - actual)]
np.mean((2*predicted["medication"] - 1)*(predicted["recovery"] - predicted["match"]))#1.0707, but it still has matching bias

# matching bias are from the mean difference of y0 when taking or not taking the treatment
# so we need to get rid of it by minusing treated and untreated with their mean(made by ols)

# for example
from sklearn.linear_model import LinearRegression

# fit the linear regression model to estimate mu_0(x)
ols0 = LinearRegression().fit(untreated[X], untreated[y])
ols1 = LinearRegression().fit(treated[X], treated[y])

# find the units that match to the treated
treated_match_index = mt0.kneighbors(treated[X], n_neighbors=1)[1].ravel()

# find the units that match to the untreatd
untreated_match_index = mt1.kneighbors(untreated[X], n_neighbors=1)[1].ravel()

predicted = pd.concat([
    (treated
     # find the Y match on the other group
     .assign(match=mt0.predict(treated[X])) 
     
     # build the bias correction term
     .assign(bias_correct=ols0.predict(treated[X]) - ols0.predict(untreated.iloc[treated_match_index][X]))),
    (untreated
     .assign(match=mt1.predict(untreated[X]))
     .assign(bias_correct=ols1.predict(untreated[X]) - ols1.predict(treated.iloc[untreated_match_index][X])))
])

predicted.head()

Unnamed: 0,sex,age,severity,medication,recovery,match,bias_correct
0,0,35.049134,0.887658,1,31,38.0,8.21267
1,1,41.580323,0.899784,1,49,49.0,19.169009
7,0,48.616165,0.832912,1,38,45.0,8.170534
10,1,30.721868,0.626067,1,34,42.0,3.768854
16,0,32.39288,0.878765,1,30,36.0,9.317305


In [153]:
# 3. why we can use ols for bias of matching

# it doesnt extrapolate on the treatment to get ATE but it is just to correct bias
# it doesnt assume linearity or parametric tests, it is actually a non parametric estimater, so it actually is more
# flexible than linear regression and can work in situations where linear regression will not

# now, with the bias correct, we can get the ATE estimation
np.mean((2*predicted["medication"] - 1)*((predicted["recovery"] - predicted["match"])-predicted["bias_correct"]))# -7.8719

-7.871934286144325

In [158]:
# pip install causalinference

Collecting causalinference
  Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
     ---------------------------------------- 0.0/51.1 kB ? eta -:--:--
     ---------------------------------------- 51.1/51.1 kB 2.6 MB/s eta 0:00:00
Installing collected packages: causalinference
Successfully installed causalinference-0.1.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [159]:
# 4. wrap up in a built in causal model for matching

from causalinference import CausalModel

cm = CausalModel(
    Y=med["recovery"].values, 
    D=med["medication"].values, 
    X=med[["severity", "age", "sex"]].values
)

cm.est_via_matching(matches=1, bias_adj=True)

print(cm.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -7.709      0.609    -12.649      0.000     -8.903     -6.514
           ATC     -6.665      0.246    -27.047      0.000     -7.148     -6.182
           ATT     -9.679      1.693     -5.717      0.000    -12.997     -6.361



In [None]:
# you can see above the 95% CI is -8.903, -6.514 and we can pick -7.709 for this right now

In [None]:
# 5. why we have matching bias?
# the curse of dimensionality
# we need more data or do PCA, but if we have enough data, there is not much matching bias