# url: https://qiita.com/nekoumei/items/648726e89d05cba6f432

In [1]:
import pandas as pd
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from tqdm import tqdm_notebook
import plotly.express as px
import plotly.graph_objects as go
import random
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## (3) データの読み込み

In [2]:
dumped_male_df_path = '/content/male_df.joblib'
dumped_biased_df_path = '/content/biased_df.joblib'

if os.path.exists(dumped_male_df_path):
    male_df = joblib.load(dumped_male_df_path)
    biased_df = joblib.load(dumped_biased_df_path)
else:
    # セレクションバイアスのあるデータの作成
    mail_df = pd.read_csv('http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')
    ### 女性向けメールが配信されたデータを削除したデータを作成
    male_df = mail_df[mail_df.segment != 'Womens E-Mail'].copy() # 女性向けメールが配信されたデータを削除
    male_df['treatment'] = male_df.segment.apply(lambda x: 1 if x == 'Mens E-Mail' else 0) #介入を表すtreatment変数を追加
    ## バイアスのあるデータの作成
    sample_rules = (male_df.history > 300) | (male_df.recency < 6) | (male_df.channel=='Multichannel')
    biased_df = pd.concat([
        male_df[(sample_rules) & (male_df.treatment == 0)].sample(frac=0.5, random_state=1),
        male_df[(sample_rules) & (male_df.treatment == 1)],
        male_df[(~sample_rules) & (male_df.treatment == 0)],
        male_df[(~sample_rules) & (male_df.treatment == 1)].sample(frac=0.5, random_state=1)
    ], axis=0, ignore_index=True)

## (6) 傾向スコアの推定


In [3]:
y = biased_df['treatment']
X = pd.get_dummies(biased_df[['recency', 'channel', 'history']], columns=['channel'], drop_first=True)

ps_model = LogisticRegression(solver='lbfgs').fit(X, y)

## (7) 傾向スコアマッチング

### 番外編：確認のため、MatchItによるマッチング結果を見る
事前準備
```
sudo R
>install.packages("MatchIt")
>install.packages("Matching")
```

In [4]:
# from rpy2.robjects import r, pandas2ri, globalenv
# from rpy2.robjects.packages import importr
# pandas2ri.activate()
# matchit = importr('MatchIt')

In [5]:
# r_biased_df = pandas2ri.py2rpy(biased_df)
# globalenv['r_biased_df'] = r_biased_df

In [6]:
# r('set.seed(1)')

In [7]:
# m_near = r('m_near <- matchit(formula = treatment ~ recency + history + channel,data = r_biased_df,method = "nearest",replace = TRUE)')

In [8]:
# matched_data = r('matched_data <- match.data(m_near)')

In [9]:
# matched_data.shape

In [10]:
# biased_df.treatment.value_counts()

マッチング後の件数が、treatment = 0　の件数 * 2より少ないので、おそらく一定のしきい値を設けて近傍点をマッチングしている

In [11]:
# ## マッチング後のデータで効果の推定
# y = matched_data.spend
# X = matched_data.treatment
# X = sm.add_constant(X)
# results = sm.OLS(y, X).fit()
# coef = results.summary().tables[1]
# coef

### 追加実験：seedを変えて10回実行し、treatmentのcoefの分布を確認する
std err分かってるから不要だったかも

In [12]:
# coefs = []
# for i in tqdm_notebook(range(10)):
#     r(f'set.seed({i})')
#     m_near = r('m_near <- matchit(formula = treatment ~ recency + history + channel,data = r_biased_df,method = "nearest",replace = TRUE)')
#     matched_data = r('matched_data <- match.data(m_near)')
#     ## マッチング後のデータで効果の推定
#     y = matched_data.spend
#     X = matched_data.treatment
#     X = sm.add_constant(X)
#     results = sm.OLS(y, X).fit()
#     coef = results.params.loc['treatment']
#     print(f'iter: {i}, coef: {coef}')
#     coefs.append(coef)

In [13]:
# fig = px.violin(pd.DataFrame({'coef': coefs}), y='coef', box=True, points='all',
#                 title='MatchItによる傾向スコアマッチング後のtreatmentの効果分布')
# fig.show()

In [14]:
# fig.write_html('ch3_plot0.html', auto_open=False)

### Pythonでの実装

In [15]:
y = biased_df['treatment']
X = pd.get_dummies(biased_df[['recency', 'channel', 'history']], columns=['channel'], drop_first=True)

In [16]:
X

Unnamed: 0,recency,history,channel_Phone,channel_Web
0,8,572.65,0,1
1,5,42.38,1,0
2,1,3003.48,1,0
3,1,662.10,0,1
4,5,44.37,0,1
...,...,...,...,...
31920,12,29.99,0,1
31921,6,156.37,0,1
31922,11,62.56,1,0
31923,11,149.71,1,0


In [17]:
def get_matched_dfs_using_propensity_score(X, y, random_state=0):
    # 傾向スコアを計算する
    ps_model = LogisticRegression(solver='lbfgs', random_state=random_state).fit(X, y)
    ps_score = ps_model.predict_proba(X)[:, 1]
    all_df = pd.DataFrame({'treatment': y, 'ps_score': ps_score})
    treatments = all_df.treatment.unique()
    if len(treatments) != 2:
        print('2群のマッチングしかできません。2群は必ず[0, 1]で表現してください。')
        raise ValueError
    # treatment == 1をgroup1, treatment == 0をgroup2とする。group1にマッチするgroup2を抽出するのでATTの推定になるはず
    group1_df = all_df[all_df.treatment==1].copy()
    group1_indices = group1_df.index
    group1_df = group1_df.reset_index(drop=True)
    group2_df = all_df[all_df.treatment==0].copy()
    group2_indices = group2_df.index
    group2_df = group2_df.reset_index(drop=True)

    # 全体の傾向スコアの標準偏差 * 0.2をしきい値とする
    threshold = all_df.ps_score.std() * 0.2

    matched_group1_dfs = []
    matched_group2_dfs = []
    _group1_df = group1_df.copy()
    _group2_df = group2_df.copy()

    while True:
        # NearestNeighborsで最近傍点1点を見つけ、マッチングする
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(_group1_df.ps_score.values.reshape(-1, 1))
        distances, indices = neigh.kneighbors(_group2_df.ps_score.values.reshape(-1, 1))
        # 重複点を削除する
        distance_df = pd.DataFrame({'distance': distances.reshape(-1), 'indices': indices.reshape(-1)})
        distance_df.index = _group2_df.index
        distance_df = distance_df.drop_duplicates(subset='indices')
        # しきい値を超えたレコードを削除する
        distance_df = distance_df[distance_df.distance < threshold]
        if len(distance_df) == 0:
            break
        # マッチングしたレコードを抽出、削除する
        group1_matched_indices = _group1_df.iloc[distance_df['indices']].index.tolist()
        group2_matched_indices = distance_df.index
        matched_group1_dfs.append(_group1_df.loc[group1_matched_indices])
        matched_group2_dfs.append(_group2_df.loc[group2_matched_indices])
        _group1_df = _group1_df.drop(group1_matched_indices)
        _group2_df = _group2_df.drop(group2_matched_indices)

    # マッチしたレコードを返す
    group1_df.index = group1_indices
    group2_df.index = group2_indices
    matched_df = pd.concat([
        group1_df.iloc[pd.concat(matched_group1_dfs).index],
        group2_df.iloc[pd.concat(matched_group2_dfs).index]
    ]).sort_index()
    matched_indices = matched_df.index

    return X.loc[matched_indices], y.loc[matched_indices]

In [18]:
matchX, matchy = get_matched_dfs_using_propensity_score(X, y)

In [19]:
matchX

Unnamed: 0,recency,history,channel_Phone,channel_Web
0,8,572.65,0,1
1,5,42.38,1,0
2,1,3003.48,1,0
3,1,662.10,0,1
4,5,44.37,0,1
...,...,...,...,...
31920,12,29.99,0,1
31921,6,156.37,0,1
31922,11,62.56,1,0
31923,11,149.71,1,0


In [20]:
## マッチング後のデータで効果の推定
y = biased_df.loc[matchX.index].spend
X = matchy
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
coef = results.summary().tables[1]
coef

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6214,0.143,4.342,0.000,0.341,0.902
treatment,0.7940,0.202,3.923,0.000,0.397,1.191


## seedを変えて10回実行し、treatmentのcoefの分布を確認する

In [21]:
coefs_py = []
for i in tqdm_notebook(range(10)):
    random.seed(i)
    os.environ['PYTHONHASHSEED'] = str(i)
    np.random.seed(i)
    y = biased_df['treatment']
    X = pd.get_dummies(biased_df[['recency', 'channel', 'history']], columns=['channel'], drop_first=True)
    matchX, matchy = get_matched_dfs_using_propensity_score(X, y, random_state=i)
    y = biased_df.loc[matchX.index].spend
    X = matchy
    X = sm.add_constant(X)
    results = sm.OLS(y, X).fit()
    coef = results.params.loc['treatment']
    print(f'iter: {i}, coef: {coef}')
    coefs_py.append(coef)

  0%|          | 0/10 [00:00<?, ?it/s]

iter: 0, coef: 0.7940240711669371
iter: 1, coef: 0.7940240711669371
iter: 2, coef: 0.7940240711669371
iter: 3, coef: 0.7940240711669371
iter: 4, coef: 0.7940240711669371
iter: 5, coef: 0.7940240711669371
iter: 6, coef: 0.7940240711669371
iter: 7, coef: 0.7940240711669371
iter: 8, coef: 0.7940240711669371
iter: 9, coef: 0.7940240711669371


seedを変えても結果は変わらず。。。  
MatchItの結果と比較してやや下振れだが、0.8088+-0.203のためMatchItでの計算結果と大きな乖離があるとは言えない

## (8) 逆確率重み付き推定（IPW）

### 番外編：確認のため、WeightItによる重み付け後の結果を見る
事前準備
```
sudo R
>install.packages("WeightIt")
```

In [22]:
# pandas2ri.activate()
# weightit = importr('WeightIt')
# r_biased_df = pandas2ri.py2rpy(biased_df)
# globalenv['r_biased_df'] = r_biased_df
# r('set.seed(1)')
# ## 重みの推定
# weighting = r('weighting <- weightit(treatment ~ recency + history + channel,data = r_biased_df,method = "ps",estimand = "ATE")')

In [23]:
# weighting[0]

In [24]:
# ## 重み付きデータでの効果の推定
# y = biased_df.spend
# X = biased_df.treatment
# X = sm.add_constant(X)
# results = sm.WLS(y, X, weights=weighting[0]).fit()
# coef = results.summary().tables[1]
# coef

## Pythonで実装

In [25]:
def get_ipw(X, y, random_state=0):
    # 傾向スコアを計算する
    ps_model = LogisticRegression(solver='lbfgs', random_state=random_state).fit(X, y)
    ps_score = ps_model.predict_proba(X)[:, 1]
    all_df = pd.DataFrame({'treatment': y, 'ps_score': ps_score})
    treatments = all_df.treatment.unique()
    if len(treatments) != 2:
        print('2群のマッチングしかできません。2群は必ず[0, 1]で表現してください。')
        raise ValueError
    # treatment == 1をgroup1, treatment == 0をgroup2とする。
    group1_df = all_df[all_df.treatment==1].copy()
    group2_df = all_df[all_df.treatment==0].copy()
    group1_df['weight'] = 1 / group1_df.ps_score
    group2_df['weight'] = 1 / (1 - group2_df.ps_score)
    weights = pd.concat([group1_df, group2_df]).sort_index()['weight'].values
    return weights

In [26]:
y = biased_df['treatment']
X = pd.get_dummies(biased_df[['recency', 'channel', 'history']], columns=['channel'], drop_first=True)
weights = get_ipw(X, y)

In [27]:
weights

array([ 2.04325576,  2.10624393, 11.44468108, ...,  2.86303711,
        2.76924805,  2.95525343])

WeightItの結果と大体あってそう

In [28]:
## 重み付きデータでの効果の推定
y = biased_df.spend
X = biased_df.treatment
X = sm.add_constant(X)
results = sm.WLS(y, X, weights=weights).fit()
coef = results.summary().tables[1]
coef

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5903,0.120,4.920,0.000,0.355,0.825
treatment,0.8856,0.170,5.203,0.000,0.552,1.219


## (9) 共変量のバランスを確認

In [29]:
def calc_absolute_mean_difference(df):
    # (treatment群の平均 - control群の平均) / 全体の標準誤差
    return ((df[df.treatment==1].drop('treatment', axis=1).mean() - df[df.treatment==0].drop('treatment', axis=1).mean()) \
            / df.drop('treatment', axis=1).std()).abs()

## 調整前のAbsolute Mean Difference
unadjusted_df = pd.get_dummies(biased_df[['treatment', 'recency', 'channel', 'history']], columns=['channel'])
unadjusted_amd = calc_absolute_mean_difference(unadjusted_df)

# 傾向スコアマッチング後のAbusolute Mean Difference
after_matching_df = pd.get_dummies(biased_df.loc[matchX.index][['treatment', 'recency', 'history', 'channel']], columns=['channel'])
after_matching_amd = calc_absolute_mean_difference(after_matching_df)

# IPWで重み付け後のAbusolute Mean Difference
# 重みのぶんレコードを増やして計算する（もっといいやり方を知りたい）
after_weighted_df = pd.get_dummies(biased_df[['treatment', 'recency', 'channel', 'history']], columns=['channel'])
weights_int = (weights * 100).astype(int)
weighted_df = []
for i, value in enumerate(after_weighted_df.values):
    weighted_df.append(np.tile(value, (weights_int[i], 1)))
weighted_df = np.concatenate(weighted_df).reshape(-1, 6)
weighted_df = pd.DataFrame(weighted_df)
weighted_df.columns = after_weighted_df.columns
after_weighted_amd = calc_absolute_mean_difference(weighted_df)

In [30]:
after_weighted_df

Unnamed: 0,treatment,recency,history,channel_Multichannel,channel_Phone,channel_Web
0,0,8,572.65,0,0,1
1,0,5,42.38,0,1,0
2,0,1,3003.48,0,1,0
3,0,1,662.10,0,0,1
4,0,5,44.37,0,0,1
...,...,...,...,...,...,...
31920,1,12,29.99,0,0,1
31921,1,6,156.37,0,0,1
31922,1,11,62.56,0,1,0
31923,1,11,149.71,0,1,0


In [31]:
weighted_df

Unnamed: 0,treatment,recency,history,channel_Multichannel,channel_Phone,channel_Web
0,0.0,8.0,572.65,0.0,0.0,1.0
1,0.0,8.0,572.65,0.0,0.0,1.0
2,0.0,8.0,572.65,0.0,0.0,1.0
3,0.0,8.0,572.65,0.0,0.0,1.0
4,0.0,8.0,572.65,0.0,0.0,1.0
...,...,...,...,...,...,...
6387145,1.0,12.0,187.77,0.0,1.0,0.0
6387146,1.0,12.0,187.77,0.0,1.0,0.0
6387147,1.0,12.0,187.77,0.0,1.0,0.0
6387148,1.0,12.0,187.77,0.0,1.0,0.0


### マッチングしたデータでの共変量のバランス

In [32]:
balance_df = pd.concat([
    pd.DataFrame({'Absolute Mean Difference': unadjusted_amd, 'Sample': 'Unadjusted'}),
    pd.DataFrame({'Absolute Mean Difference': after_matching_amd, 'Sample': 'Adjusted'})
])

fig = px.scatter(balance_df, x='Absolute Mean Difference', y=balance_df.index, color='Sample',
                title='3.5 マッチングしたデータでの共変量のバランス')
fig.show()

In [33]:
fig.write_html('ch3_plot1.html', auto_open=False)

### 重み付きデータでの共変量のバランス

In [34]:
balance_df = pd.concat([
    pd.DataFrame({'Absolute Mean Difference': unadjusted_amd, 'Sample': 'Unadjusted'}),
    pd.DataFrame({'Absolute Mean Difference': after_weighted_amd, 'Sample': 'Adjusted'})
])

fig = px.scatter(balance_df, x='Absolute Mean Difference', y=balance_df.index, color='Sample',
                title='重み付けしたデータでの共変量のバランス')
fig.show()

In [35]:
fig.write_html('ch3_plot2.html', auto_open=False)

## (10) 統計モデルを用いたメールの配信のログを分析

In [36]:
random_state = 0

In [37]:
## 学習データと配信ログを作るデータに分割
male_df_train, male_df_test = train_test_split(male_df, test_size=0.5, random_state=random_state)
male_df_train = male_df_train[male_df_train.treatment == 0]

In [38]:
male_df_train

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
10237,7,3) $200 - $350,202.11,0,1,Surburban,0,Multichannel,No E-Mail,1,0,0.0,0
25139,10,1) $0 - $100,65.94,0,1,Urban,0,Web,No E-Mail,0,0,0.0,0
50898,2,1) $0 - $100,29.99,0,1,Urban,0,Web,No E-Mail,0,0,0.0,0
45472,8,3) $200 - $350,252.40,1,1,Urban,0,Web,No E-Mail,1,0,0.0,0
1969,1,4) $350 - $500,494.47,0,1,Urban,1,Multichannel,No E-Mail,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22501,8,4) $350 - $500,418.12,1,0,Urban,0,Multichannel,No E-Mail,0,0,0.0,0
63088,8,2) $100 - $200,185.73,1,0,Surburban,1,Phone,No E-Mail,0,0,0.0,0
45716,10,"6) $750 - $1,000",807.39,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0
31973,11,1) $0 - $100,83.91,1,0,Surburban,0,Web,No E-Mail,0,0,0.0,0


In [39]:
male_df_test

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
38852,11,1) $0 - $100,94.30,1,0,Surburban,1,Web,No E-Mail,0,0,0.0,0
32024,6,1) $0 - $100,69.88,0,1,Urban,1,Phone,Mens E-Mail,1,0,0.0,1
43984,10,"6) $750 - $1,000",887.06,1,0,Surburban,1,Multichannel,Mens E-Mail,0,0,0.0,1
30884,8,3) $200 - $350,200.84,1,0,Urban,1,Phone,Mens E-Mail,0,0,0.0,1
9518,1,3) $200 - $350,228.80,0,1,Surburban,0,Multichannel,Mens E-Mail,0,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55471,10,1) $0 - $100,29.99,1,0,Urban,0,Phone,Mens E-Mail,0,0,0.0,1
27572,3,3) $200 - $350,205.93,0,1,Surburban,0,Phone,Mens E-Mail,1,0,0.0,1
53413,3,3) $200 - $350,204.46,0,1,Urban,0,Phone,Mens E-Mail,0,0,0.0,1
12251,10,1) $0 - $100,87.99,0,1,Urban,0,Web,No E-Mail,0,0,0.0,0


In [40]:
## 売上が発生する確率を予測するモデルを作成
model = LogisticRegression(random_state=random_state)

y_train = male_df_train['conversion']

X_train = pd.get_dummies(
    male_df_train[['recency', 'history_segment', 'channel', 'zip_code']], columns=['history_segment', 'channel', 'zip_code'], drop_first=True
)

X_test = pd.get_dummies(
    male_df_test[['recency', 'history_segment', 'channel', 'zip_code']], columns=['history_segment', 'channel', 'zip_code'], drop_first=True
)

model.fit(X_train, y_train)

## 売上の発生確率からメールの配信確率を決める
pred_cv = model.predict_proba(X_test)[:, 1]
pred_cv_rank = pd.Series(pred_cv, name='proba').rank(pct=True)

## 配信確率を元にメールの配信を決める
mail_assign = pred_cv_rank.apply(lambda x: np.random.binomial(n=1, p=x))

In [41]:
## 配信ログを作成
male_df_test['mail_assign'] = mail_assign
male_df_test['ps'] = pred_cv_rank

ml_male_df = male_df_test[
    ((male_df_test.treatment == 1) & (male_df_test.mail_assign == 1)) |
    ((male_df_test.treatment == 0) & (male_df_test.mail_assign == 0))
].copy()

In [42]:
male_df_test

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,mail_assign,ps
38852,11,1) $0 - $100,94.30,1,0,Surburban,1,Web,No E-Mail,0,0,0.0,0,,
32024,6,1) $0 - $100,69.88,0,1,Urban,1,Phone,Mens E-Mail,1,0,0.0,1,,
43984,10,"6) $750 - $1,000",887.06,1,0,Surburban,1,Multichannel,Mens E-Mail,0,0,0.0,1,,
30884,8,3) $200 - $350,200.84,1,0,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,,
9518,1,3) $200 - $350,228.80,0,1,Surburban,0,Multichannel,Mens E-Mail,0,0,0.0,1,1.0,0.895950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55471,10,1) $0 - $100,29.99,1,0,Urban,0,Phone,Mens E-Mail,0,0,0.0,1,,
27572,3,3) $200 - $350,205.93,0,1,Surburban,0,Phone,Mens E-Mail,1,0,0.0,1,,
53413,3,3) $200 - $350,204.46,0,1,Urban,0,Phone,Mens E-Mail,0,0,0.0,1,,
12251,10,1) $0 - $100,87.99,0,1,Urban,0,Web,No E-Mail,0,0,0.0,0,0.0,0.390013


In [43]:
ml_male_df

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,mail_assign,ps
9518,1,3) $200 - $350,228.80,0,1,Surburban,0,Multichannel,Mens E-Mail,0,0,0.0,1,1.0,0.895950
17644,6,1) $0 - $100,50.49,1,0,Surburban,1,Web,No E-Mail,0,0,0.0,0,0.0,0.082837
17686,10,1) $0 - $100,43.94,0,1,Urban,1,Web,No E-Mail,0,0,0.0,0,0.0,0.372788
7830,10,"7) $1,000 +",1117.16,1,1,Surburban,1,Multichannel,Mens E-Mail,1,0,0.0,1,1.0,0.938471
5621,2,5) $500 - $750,693.62,0,1,Surburban,1,Phone,Mens E-Mail,0,0,0.0,1,1.0,0.099216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4141,1,1) $0 - $100,29.99,1,0,Urban,0,Phone,Mens E-Mail,0,0,0.0,1,1.0,0.742854
7324,10,1) $0 - $100,88.04,1,0,Surburban,0,Phone,Mens E-Mail,0,0,0.0,1,1.0,0.676163
2463,10,1) $0 - $100,40.95,0,1,Surburban,0,Phone,Mens E-Mail,0,0,0.0,1,1.0,0.488642
20514,1,3) $200 - $350,229.55,0,1,Surburban,0,Web,Mens E-Mail,0,0,0.0,1,1.0,0.859647


In [44]:
## 平均の比較
## 実験をしていた場合の平均の差を確認
y = male_df_test.spend
X = male_df_test.treatment
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
coef = results.summary().tables[1]
coef

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5585,0.149,3.752,0.000,0.267,0.850
treatment,0.9251,0.211,4.389,0.000,0.512,1.338


RCTを行っていた場合は$0.925の売上増加が期待できる

In [45]:
## セレクションバイアスの影響を受けている平均の比較
y = ml_male_df.spend
X = ml_male_df.treatment
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
coef = results.summary().tables[1]
coef

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4850,0.337,1.440,0.150,-0.175,1.145
treatment,0.9733,0.480,2.026,0.043,0.031,1.915


### 傾向スコアマッチングの推定(TPS)

In [46]:
def get_matched_dfs_using_obtained_propensity_score(X, y, ps_score, random_state=0):
    all_df = pd.DataFrame({'treatment': y, 'ps_score': ps_score})
    treatments = all_df.treatment.unique()
    if len(treatments) != 2:
        print('2群のマッチングしかできません。2群は必ず[0, 1]で表現してください。')
        raise ValueError
    # treatment == 1をgroup1, treatment == 0をgroup2とする。group1にマッチするgroup2を抽出するのでATTの推定になるはず
    group1_df = all_df[all_df.treatment==1].copy()
    group1_indices = group1_df.index
    group1_df = group1_df.reset_index(drop=True)
    group2_df = all_df[all_df.treatment==0].copy()
    group2_indices = group2_df.index
    group2_df = group2_df.reset_index(drop=True)

    # 全体の傾向スコアの標準偏差 * 0.2をしきい値とする
    threshold = all_df.ps_score.std() * 0.2

    matched_group1_dfs = []
    matched_group2_dfs = []
    _group1_df = group1_df.copy()
    _group2_df = group2_df.copy()

    while True:
        # NearestNeighborsで最近傍点1点を見つけ、マッチングする
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(_group1_df.ps_score.values.reshape(-1, 1))
        distances, indices = neigh.kneighbors(_group2_df.ps_score.values.reshape(-1, 1))
        # 重複点を削除する
        distance_df = pd.DataFrame({'distance': distances.reshape(-1), 'indices': indices.reshape(-1)})
        distance_df.index = _group2_df.index
        distance_df = distance_df.drop_duplicates(subset='indices')
        # しきい値を超えたレコードを削除する
        distance_df = distance_df[distance_df.distance < threshold]
        if len(distance_df) == 0:
            break
        # マッチングしたレコードを抽出、削除する
        group1_matched_indices = _group1_df.iloc[distance_df['indices']].index.tolist()
        group2_matched_indices = distance_df.index
        matched_group1_dfs.append(_group1_df.loc[group1_matched_indices])
        matched_group2_dfs.append(_group2_df.loc[group2_matched_indices])
        _group1_df = _group1_df.drop(group1_matched_indices)
        _group2_df = _group2_df.drop(group2_matched_indices)

    # マッチしたレコードを返す
    group1_df.index = group1_indices
    group2_df.index = group2_indices
    matched_df = pd.concat([
        group1_df.iloc[pd.concat(matched_group1_dfs).index],
        group2_df.iloc[pd.concat(matched_group2_dfs).index]
    ]).sort_index()
    matched_indices = matched_df.index

    return X.loc[matched_indices], y.loc[matched_indices]

In [47]:
matchX, matchy = get_matched_dfs_using_obtained_propensity_score(ml_male_df, ml_male_df.treatment, ps_score=ml_male_df.ps)

In [48]:
## マッチング後のデータで効果の推定
y = matchX.spend
X = matchy
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
coef = results.summary().tables[1]
coef

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3307,0.413,0.800,0.424,-0.479,1.141
treatment,1.4101,0.584,2.414,0.016,0.264,2.556


### IPWの推定

In [49]:
def get_ipw_obtained_ps(X, y, ps_score, random_state=0):
    all_df = pd.DataFrame({'treatment': y, 'ps_score': ps_score})
    treatments = all_df.treatment.unique()
    if len(treatments) != 2:
        print('2群のマッチングしかできません。2群は必ず[0, 1]で表現してください。')
        raise ValueError
    # treatment == 1をgroup1, treatment == 0をgroup2とする。
    group1_df = all_df[all_df.treatment==1].copy()
    group2_df = all_df[all_df.treatment==0].copy()
    group1_df['weight'] = 1 / group1_df.ps_score
    group2_df['weight'] = 1 / (1 - group2_df.ps_score)
    weights = pd.concat([group1_df, group2_df]).sort_index()['weight'].values
    return weights

In [50]:
weights = get_ipw_obtained_ps(ml_male_df, ml_male_df.treatment, ps_score=ml_male_df.ps)
## 重み付きデータでの効果の推定
y = ml_male_df.spend
X = ml_male_df.treatment
X = sm.add_constant(X)
results = sm.WLS(y, X, weights=weights).fit()
coef = results.summary().tables[1]
coef

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5067,0.319,1.588,0.112,-0.119,1.132
treatment,0.7484,0.454,1.650,0.099,-0.141,1.638


In [51]:
# RCTの効果と傾向スコアマッチング後の効果の差分
0.9251 -1.0771

-0.1519999999999999

In [52]:
# RCTの効果とIPW重み付け後の効果の差分
0.9251 - 0.7484

0.17670000000000008

書籍の記載より大きい。傾向スコアマッチング後の推定された効果よりはRCTで推定された効果に近く、p-valueも小さい

### 重み付きデータの共変量のバランス

In [53]:
## 調整前のAbsolute Mean Difference
unadjusted_df = pd.get_dummies(
    ml_male_df[['treatment', 'recency', 'history_segment', 'channel', 'zip_code']], columns=['history_segment', 'channel', 'zip_code']
)
unadjusted_amd = calc_absolute_mean_difference(unadjusted_df)

# IPWで重み付け後のAbusolute Mean Difference
# 重みのぶんレコードを増やして計算する（もっといいやり方を知りたい）
after_weighted_df = pd.get_dummies(
    ml_male_df[['treatment', 'recency', 'history_segment', 'channel', 'zip_code']], columns=['history_segment', 'channel', 'zip_code']
)
weights_int = (weights * 100).astype(int)
weighted_df = []
for i, value in enumerate(after_weighted_df.values):
    weighted_df.append(np.tile(value, (weights_int[i], 1)))
weighted_df = np.concatenate(weighted_df).reshape(-1, 15)
weighted_df = pd.DataFrame(weighted_df)
weighted_df.columns = after_weighted_df.columns
after_weighted_amd = calc_absolute_mean_difference(weighted_df)

In [54]:
balance_df = pd.concat([
    pd.DataFrame({'Absolute Mean Difference': unadjusted_amd, 'Sample': 'Unadjusted'}),
    pd.DataFrame({'Absolute Mean Difference': after_weighted_amd, 'Sample': 'Adjusted'})
])
fig = px.scatter(balance_df, x='Absolute Mean Difference', y=balance_df.index, color='Sample',
                title='重み付けしたデータでの共変量のバランス')
fig.show()

In [55]:
fig.write_html('ch3_plot3.html', auto_open=False)