# url: https://qiita.com/nekoumei/items/648726e89d05cba6f432

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

## (2) データの読み込み

In [2]:
email_data = pd.read_csv('http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')

## (3) ルールによるメールの配信を行ったログを作成

In [3]:
## データの整形とrunning variableの追加
male_data = email_data[email_data.segment.isin(['Mens E-Mail', 'No E-Mail'])].copy()
male_data['treatment'] = male_data.segment.apply(lambda x: 1 if x=='Mens E-Mail' else 0)
male_data['history_log'] = np.log(male_data.history)

In [4]:
male_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0,5.796301
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1,6.515942
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,6.514816
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1,4.621437
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0,5.486538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63994,7,1) $0 - $100,86.46,0,1,Urban,0,Web,Mens E-Mail,0,0,0.0,1,4.459682
63995,10,2) $100 - $200,105.54,1,0,Urban,0,Web,Mens E-Mail,0,0,0.0,1,4.659090
63996,5,1) $0 - $100,38.91,0,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,3.661251
63997,6,1) $0 - $100,29.99,1,0,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,3.400864


In [5]:
## cut-off の値を指定
threshold_value = 5.5

## ルールによる介入を再現したデータを作成
## cut-offよりrunning variableが大きければが配信されたデータのみ残す
## 逆の場合には配信されなかったデータのみ残す
## running variableを0.1単位で区切ったグループわけの変数も追加しておく
male_data['history_log_grp'] = np.round(male_data.history_log / 0.1) * 0.1

rdd_data = male_data[
    ((male_data.history_log > threshold_value) & (male_data.segment == 'Mens E-Mail')) |
    ((male_data.history_log <= threshold_value) & (male_data.segment == 'No E-Mail'))
]

In [6]:
rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1,6.515942,6.5
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,6.514816,6.5
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0,5.486538,5.5
15,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0,4.062682,4.1
19,5,"6) $750 - $1,000",828.42,1,0,Surburban,1,Multichannel,Mens E-Mail,0,0,0.0,1,6.719520,6.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63983,2,1) $0 - $100,83.03,0,1,Urban,0,Phone,No E-Mail,0,0,0.0,0,4.419202,4.4
63987,1,1) $0 - $100,79.70,1,0,Surburban,1,Web,No E-Mail,0,0,0.0,0,4.378270,4.4
63990,6,1) $0 - $100,80.02,0,1,Surburban,0,Phone,No E-Mail,0,0,0.0,0,4.382277,4.4
63992,1,5) $500 - $750,519.69,1,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,6.253232,6.3


## (4) RCTデータとRDDデータの傾向の比較

### running variableとサイト来訪率のプロット(RCTデータ)

In [7]:
summarised = male_data.groupby(['history_log_grp', 'segment']).agg(visit=('visit', 'mean'), N=('visit', 'count')).reset_index()
summarised = summarised[summarised.N > 10]

In [8]:
summarised

Unnamed: 0,history_log_grp,segment,visit,N
0,3.4,Mens E-Mail,0.146545,2750
1,3.4,No E-Mail,0.086988,2805
2,3.5,Mens E-Mail,0.169118,272
3,3.5,No E-Mail,0.084942,259
4,3.6,Mens E-Mail,0.158576,309
...,...,...,...,...
81,7.4,No E-Mail,0.240000,25
82,7.5,Mens E-Mail,0.173913,23
83,7.5,No E-Mail,0.285714,14
84,7.6,Mens E-Mail,0.333333,12


In [9]:
fig = px.scatter(summarised, x='history_log_grp', y='visit', color='segment', symbol='segment', size='N'
           , title='5.2 実験データにおける来訪率とlog(history_i)')
fig.show()

In [10]:
fig.write_html('ch5_plot1.html')

### running variableとサイト来訪率のプロット(RDDデータ)

In [11]:
summarised = rdd_data.groupby(['history_log_grp', 'segment']).agg(visit=('visit', 'mean'), N=('visit', 'count')).reset_index()
summarised = summarised[summarised.N > 10]

In [12]:
summarised

Unnamed: 0,history_log_grp,segment,visit,N
0,3.4,No E-Mail,0.086988,2805
1,3.5,No E-Mail,0.084942,259
2,3.6,No E-Mail,0.094637,317
3,3.7,No E-Mail,0.07309,301
4,3.8,No E-Mail,0.078652,356
5,3.9,No E-Mail,0.074286,350
6,4.0,No E-Mail,0.07672,378
7,4.1,No E-Mail,0.078947,418
8,4.2,No E-Mail,0.087719,456
9,4.3,No E-Mail,0.066794,524


In [13]:
fig = px.scatter(summarised, x='history_log_grp', y='visit', color='segment', symbol='segment', size='N'
           , title='5.3 非実験データにおける来訪率とlog(history_i)')

fig.add_trace(go.Scatter(
    x=[threshold_value, threshold_value],
    y=[0, 0.35],
    mode="lines",
    line=dict(color='gray', dash='dot'),
    name='cut-off value'
))

fig.show()

In [14]:
fig.write_html('ch5_plot2.html')

## (5) 集計による分析

In [15]:
## RCTデータでの比較
male_data[
    (male_data.history_log > 5) & (male_data.history_log < 6)
].groupby('treatment').agg(count=('visit', 'count'), visit_rate=('visit', 'mean'))

Unnamed: 0_level_0,count,visit_rate
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7167,0.12125
1,7135,0.20042


In [16]:
## RDDデータでの比較
rdd_data.groupby('treatment').agg(count=('visit', 'count'), visit_rate=('visit', 'mean'))

Unnamed: 0_level_0,count,visit_rate
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13926,0.090694
1,7366,0.224002


## (6) 回帰分析による分析

In [17]:
## 線形回帰による分析
y = rdd_data.visit
X = rdd_data[['treatment', 'history_log']]
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
coef = results.summary().tables[1]
coef = pd.read_html(coef.as_html(), header=0, index_col=0)[0]
pd.DataFrame(coef.loc['treatment']).T

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
treatment,0.1137,0.008,14.24,0.0,0.098,0.129


### 追加実験：直感的な回帰不連続デザインをやってみる
カットオフ値より左側と右側でそれぞれ回帰モデルをつくり、各モデルのカットオフ値における推定値の差分を見る  
（理論的に正しいかどうかは分からない）

In [18]:
## 非線形回帰による分析
### 多項式回帰を行うクラス
class NonliniearRegressor:
    def __init__(self, degree=4, poly_features=None):
        self.degree = degree
        self.poly_features = poly_features

    def _preprocess(self, X):
        return PolynomialFeatures(degree=self.degree).fit_transform(X)

    def fit(self, X, y):
        X = self._preprocess(X)
        self.model = sm.OLS(y, X) # PolynomialFeaturesにバイアス項が含まれている
        self.results = self.model.fit()
        coef = self.results.summary().tables[1]
        self.coef = pd.read_html(coef.as_html(), header=0, index_col=0)[0]

    def predict(self, X):
        X = self._preprocess(X)
        return self.model.predict(self.results.params, X)

In [19]:
# カットオフ値前後でデータを分割する
left_rdd_data = rdd_data[rdd_data.history_log < threshold_value].copy()
right_rdd_data = rdd_data[rdd_data.history_log >= threshold_value].copy()

left_rdd_data.reset_index(drop=True, inplace=True)
right_rdd_data.reset_index(drop=True, inplace=True)

# 左右ごとに回帰する
left_model = NonliniearRegressor(poly_features=['history_log'])
left_model.fit(left_rdd_data[['history_log']], left_rdd_data.visit)
left_rdd_data['y_pred'] = left_model.predict(left_rdd_data[['history_log']])

right_model = NonliniearRegressor(poly_features=['history_log'])
right_model.fit(right_rdd_data[['history_log']], right_rdd_data.visit)
right_rdd_data['y_pred'] = right_model.predict(right_rdd_data[['history_log']])

# カットオフ値での左右モデルの予測値の差分を見る
print(right_model.predict([[5.5]]) - left_model.predict([[threshold_value]]))

[0.07407941]


In [20]:
left_rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp,y_pred
0,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0,5.486538,5.5,0.130720
1,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0,4.062682,4.1,0.080307
2,9,1) $0 - $100,29.99,0,1,Surburban,1,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
3,2,1) $0 - $100,29.99,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
4,4,1) $0 - $100,78.24,1,0,Surburban,0,Web,No E-Mail,0,0,0.0,0,4.359781,4.4,0.082828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13921,10,2) $100 - $200,168.21,0,1,Surburban,0,Phone,No E-Mail,0,0,0.0,0,5.125213,5.1,0.096893
13922,4,2) $100 - $200,125.53,0,1,Rural,1,Phone,No E-Mail,0,0,0.0,0,4.832545,4.8,0.087487
13923,2,1) $0 - $100,83.03,0,1,Urban,0,Phone,No E-Mail,0,0,0.0,0,4.419202,4.4,0.083283
13924,1,1) $0 - $100,79.70,1,0,Surburban,1,Web,No E-Mail,0,0,0.0,0,4.378270,4.4,0.082971


In [21]:
right_rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp,y_pred
0,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1,6.515942,6.5,0.221742
1,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,6.514816,6.5,0.221740
2,5,"6) $750 - $1,000",828.42,1,0,Surburban,1,Multichannel,Mens E-Mail,0,0,0.0,1,6.719520,6.7,0.224172
3,9,3) $200 - $350,334.24,1,0,Urban,0,Web,Mens E-Mail,0,0,0.0,1,5.811859,5.8,0.225486
4,1,5) $500 - $750,514.52,0,1,Surburban,1,Web,Mens E-Mail,0,0,0.0,1,6.243234,6.2,0.223693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7361,1,3) $200 - $350,309.96,1,1,Surburban,1,Phone,Mens E-Mail,0,0,0.0,1,5.736443,5.7,0.223419
7362,4,3) $200 - $350,337.36,1,0,Urban,0,Web,Mens E-Mail,1,0,0.0,1,5.821151,5.8,0.225661
7363,2,"6) $750 - $1,000",772.99,1,1,Surburban,1,Web,Mens E-Mail,0,0,0.0,1,6.650266,6.7,0.222870
7364,1,5) $500 - $750,519.69,1,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,6.253232,6.3,0.223562


In [22]:
left_rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp,y_pred
0,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0,5.486538,5.5,0.130720
1,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0,4.062682,4.1,0.080307
2,9,1) $0 - $100,29.99,0,1,Surburban,1,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
3,2,1) $0 - $100,29.99,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
4,4,1) $0 - $100,78.24,1,0,Surburban,0,Web,No E-Mail,0,0,0.0,0,4.359781,4.4,0.082828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13921,10,2) $100 - $200,168.21,0,1,Surburban,0,Phone,No E-Mail,0,0,0.0,0,5.125213,5.1,0.096893
13922,4,2) $100 - $200,125.53,0,1,Rural,1,Phone,No E-Mail,0,0,0.0,0,4.832545,4.8,0.087487
13923,2,1) $0 - $100,83.03,0,1,Urban,0,Phone,No E-Mail,0,0,0.0,0,4.419202,4.4,0.083283
13924,1,1) $0 - $100,79.70,1,0,Surburban,1,Web,No E-Mail,0,0,0.0,0,4.378270,4.4,0.082971


In [23]:
right_rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp,y_pred
0,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1,6.515942,6.5,0.221742
1,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,6.514816,6.5,0.221740
2,5,"6) $750 - $1,000",828.42,1,0,Surburban,1,Multichannel,Mens E-Mail,0,0,0.0,1,6.719520,6.7,0.224172
3,9,3) $200 - $350,334.24,1,0,Urban,0,Web,Mens E-Mail,0,0,0.0,1,5.811859,5.8,0.225486
4,1,5) $500 - $750,514.52,0,1,Surburban,1,Web,Mens E-Mail,0,0,0.0,1,6.243234,6.2,0.223693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7361,1,3) $200 - $350,309.96,1,1,Surburban,1,Phone,Mens E-Mail,0,0,0.0,1,5.736443,5.7,0.223419
7362,4,3) $200 - $350,337.36,1,0,Urban,0,Web,Mens E-Mail,1,0,0.0,1,5.821151,5.8,0.225661
7363,2,"6) $750 - $1,000",772.99,1,1,Surburban,1,Web,Mens E-Mail,0,0,0.0,1,6.650266,6.7,0.222870
7364,1,5) $500 - $750,519.69,1,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,6.253232,6.3,0.223562


書籍の記述とほぼ一致

### 追加：回帰の結果の可視化

In [24]:
left_rdd_data = left_rdd_data.sort_values('history_log')
right_rdd_data = right_rdd_data.sort_values('history_log')

In [25]:
left_rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp,y_pred
6686,11,1) $0 - $100,29.99,0,1,Surburban,1,Web,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
3370,6,1) $0 - $100,29.99,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
9705,7,1) $0 - $100,29.99,1,0,Urban,0,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
9698,5,1) $0 - $100,29.99,1,0,Rural,0,Phone,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
3381,7,1) $0 - $100,29.99,0,1,Rural,0,Web,No E-Mail,0,0,0.0,0,3.400864,3.4,0.087529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10363,1,3) $200 - $350,244.38,1,0,Surburban,0,Phone,No E-Mail,0,0,0.0,0,5.498724,5.5,0.132541
2946,6,3) $200 - $350,244.46,1,1,Surburban,0,Web,No E-Mail,0,0,0.0,0,5.499052,5.5,0.132590
6736,1,3) $200 - $350,244.58,1,0,Urban,1,Phone,No E-Mail,0,0,0.0,0,5.499542,5.5,0.132665
348,1,3) $200 - $350,244.66,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0,5.499869,5.5,0.132715


In [26]:
right_rdd_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp,y_pred
5634,9,3) $200 - $350,244.70,1,1,Urban,0,Web,Mens E-Mail,0,0,0.0,1,5.500033,5.5,0.206817
836,1,3) $200 - $350,244.78,1,0,Rural,0,Phone,Mens E-Mail,0,0,0.0,1,5.500360,5.5,0.206854
6692,9,3) $200 - $350,244.80,1,0,Rural,1,Multichannel,Mens E-Mail,0,0,0.0,1,5.500442,5.5,0.206863
430,8,3) $200 - $350,244.81,0,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,5.500482,5.5,0.206868
6127,7,3) $200 - $350,244.88,1,0,Surburban,1,Phone,Mens E-Mail,0,0,0.0,1,5.500768,5.5,0.206899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,1,"7) $1,000 +",2728.69,1,1,Urban,1,Phone,Mens E-Mail,1,0,0.0,1,7.911577,7.9,0.288931
3743,1,"7) $1,000 +",2766.42,0,1,Urban,1,Multichannel,Mens E-Mail,0,0,0.0,1,7.925309,7.9,0.289023
7031,1,"7) $1,000 +",2859.63,1,1,Urban,1,Web,Mens E-Mail,0,0,0.0,1,7.958448,8.0,0.289013
5298,1,"7) $1,000 +",2895.11,1,1,Urban,1,Multichannel,Mens E-Mail,0,0,0.0,1,7.970778,8.0,0.288923


In [27]:
fig = px.scatter(summarised, x='history_log_grp', y='visit', color='segment', symbol='segment', size='N'
           , title='RDDの可視化：非実験データにおける来訪率とlog(history_i)')

fig.add_trace(go.Scatter(
    x=[threshold_value, threshold_value],
    y=[0, 0.35],
    mode="lines",
    line=dict(color='gray', dash='dot'),
    name='cut-off value'
))

fig.add_trace(go.Scatter(
    x=left_rdd_data.history_log,
    y=left_rdd_data.y_pred,
    mode='lines',
    name='回帰の結果(left)'
))

fig.add_trace(go.Scatter(
    x=right_rdd_data.history_log,
    y=right_rdd_data.y_pred,
    mode='lines',
    name='回帰の結果(right)'
))

In [28]:
fig.write_html('ch5_plot2_2.html')

## rddtoolsの再現（非線形回帰による分析）

In [29]:
class RDDRegression:
# Rパッケージrddtoolsのrdd_reg_lmを再現する
# 参考：https://cran.r-project.org/web/packages/rddtools/rddtools.pdf P23
    def __init__(self, cut_point, degree=4):
        self.cut_point = cut_point
        self.degree = degree

    def _preprocess(self, X):
        X = X - threshold_value
        X_poly = PolynomialFeatures(degree=self.degree, include_bias=False).fit_transform(X)
        D_df = X.applymap(lambda x: 1 if x >= 0 else 0)
        X = pd.DataFrame(X_poly, columns=[f'X^{i+1}' for i in range(X_poly.shape[1])])
        X['D'] = D_df
        for i in range(X_poly.shape[1]):
            X[f'D_X^{i+1}'] = X_poly[:, i] * X['D']
        return X

    def fit(self, X, y):
        X = X.copy()
        X = self._preprocess(X)
        self.X = X
        self.y = y
        X = sm.add_constant(X)
        self.model = sm.OLS(y, X)
        self.results = self.model.fit()
        coef = self.results.summary().tables[1]
        self.coef = pd.read_html(coef.as_html(), header=0, index_col=0)[0]

    def predict(self, X):
        X = self._preprocess(X)
        X = sm.add_constant(X)
        return self.model.predict(self.results.params, X)

In [30]:
rdd_data = rdd_data.reset_index(drop=True)
rddr = RDDRegression(cut_point=threshold_value, degree=4)
rddr.fit(rdd_data[['history_log']], rdd_data.visit)
coef = rddr.results.summary().tables[1]
coef = pd.read_html(coef.as_html(), header=0, index_col=0)[0]
coef

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1327,0.014,9.654,0.0,0.106,0.16
X^1,0.1522,0.092,1.661,0.097,-0.027,0.332
X^2,0.1877,0.178,1.056,0.291,-0.161,0.536
X^3,0.1068,0.126,0.847,0.397,-0.14,0.354
X^4,0.0224,0.029,0.769,0.442,-0.035,0.079
D,0.0741,0.02,3.774,0.0,0.036,0.113
D_X^1,-0.0406,0.135,-0.3,0.764,-0.306,0.225
D_X^2,-0.3928,0.271,-1.449,0.147,-0.924,0.139
D_X^3,0.0278,0.198,0.14,0.888,-0.36,0.415
D_X^4,-0.0484,0.047,-1.032,0.302,-0.14,0.044


変数Dのcoefを見る。書籍のrddtoolsによる推定とp-value含めほぼ一致。

In [31]:
y_pred = rddr.predict(rdd_data[['history_log']])
visualized = pd.DataFrame({'history_log': rdd_data['history_log'], 'y_pred': y_pred})
visualized = visualized.sort_values('history_log')

In [32]:
y_pred

array([0.22174223, 0.22173994, 0.13071989, ..., 0.08300197, 0.22356222,
       0.2248339 ])

In [33]:
visualized

Unnamed: 0,history_log,y_pred
14132,3.400864,0.087529
20322,3.400864,0.087529
13507,3.400864,0.087529
20918,3.400864,0.087529
13509,3.400864,0.087529
...,...,...
1815,7.911577,0.288931
10996,7.925309,0.289023
20363,7.958448,0.289013
15417,7.970778,0.288923


### 追加：回帰の結果の可視化

In [34]:
fig = px.scatter(summarised, x='history_log_grp', y='visit', color='segment', symbol='segment', size='N'
           , title='RDDの可視化：非実験データにおける来訪率とlog(history_i)')

fig.add_trace(go.Scatter(
    x=[threshold_value, threshold_value],
    y=[0, 0.35],
    mode="lines",
    line=dict(color='gray', dash='dot'),
    name='cut-off value'
))

fig.add_trace(go.Scatter(
    x=visualized.history_log,
    y=visualized.y_pred,
    mode='lines',
    name='回帰の結果'
))

In [35]:
fig.write_html('ch5_plot2_3.html')

## (7) 分析に使うデータの幅と分析結果のプロット

In [36]:
bound_list = [i / 100 for i in range(2, 101)]
lates = []
Ns = []
ses = []
for bound in bound_list:
    bounded_data = rdd_data[(rdd_data.history_log >= threshold_value - bound) & (rdd_data.history_log < threshold_value + bound)]
    agg_data = bounded_data.groupby('treatment').agg(count=('visit', 'count'), visit_rate=('visit', 'mean'))
    lates.append(agg_data.loc[1, 'visit_rate'] - agg_data.loc[0, 'visit_rate'])
    N = sum(agg_data['count'])
    Ns.append(N)
    ses.append(np.sqrt(sum(agg_data.visit_rate ** 2)) / np.sqrt(N))

result_data = pd.DataFrame({
    'bound': bound_list,
    'late': lates,
    'N': Ns,
    'se': ses
})

In [37]:
result_data

Unnamed: 0,bound,late,N,se
0,0.02,0.112496,281,0.014399
1,0.03,0.097988,408,0.012047
2,0.04,0.096886,578,0.010568
3,0.05,0.098698,729,0.010085
4,0.06,0.090850,873,0.008900
...,...,...,...,...
94,0.96,0.124169,12439,0.002173
95,0.97,0.124086,12541,0.002168
96,0.98,0.123609,12630,0.002154
97,0.99,0.123452,12723,0.002145


In [38]:
fig = px.line(result_data, x='bound', y='late', title='5.5 利用するデータの範囲と推定結果')

fig.add_trace(go.Scatter(
    x=result_data.bound,
    y=result_data.late - (1.96 * result_data.se),
    fill=None,
    mode='lines',
    line_color='indigo',
    ))

fig.add_trace(go.Scatter(
    x=result_data.bound,
    y=result_data.late + (1.96 * result_data.se),
    fill='tonexty', # fill area between trace0 and trace1
    mode='lines', line_color='indigo'))

In [39]:
fig.write_html('ch5_plot3.html')

## (8) nonparametric RDD

 Imbens-Kalyanaraman(2011)の手法で最適なデータ幅を推定するみたいだが、よくわからなかったので雑に調べる

In [40]:
train, test = train_test_split(rdd_data, test_size=0.1, random_state=0)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

bound_list = [i / 100 for i in range(2, 301)]
min_mse = np.inf
# 0.01刻みでバンド幅を変えて、test dataのMSEが小さくなるときのバンド幅を出す
for bound in tqdm_notebook(bound_list):
    bounded_data = train[(train.history_log >= threshold_value - bound) & (train.history_log < threshold_value + bound)]
    bounded_data = bounded_data.reset_index(drop=True)
    rddr = RDDRegression(cut_point=threshold_value, degree=4)
    rddr.fit(bounded_data[['history_log']], bounded_data.visit)
    mse = mean_squared_error(test.history_log, rddr.predict(test[['history_log']]))
    if mse < min_mse:
        min_mse = mse
        min_bound = bound
        min_rddr = rddr
        print(f'min mse: {min_mse}, bound: {min_bound}')

  0%|          | 0/299 [00:00<?, ?it/s]

min mse: 2.5243202077622704e+17, bound: 0.02
min mse: 3081618022828803.0, bound: 0.03
min mse: 109616673657873.81, bound: 0.04
min mse: 29447688284719.94, bound: 0.05
min mse: 2246295024526.888, bound: 0.06
min mse: 820539191140.5814, bound: 0.07
min mse: 727782333876.822, bound: 0.09
min mse: 141322573518.75885, bound: 0.1
min mse: 50720103617.071556, bound: 0.11
min mse: 25016889912.390377, bound: 0.12
min mse: 2487593264.34267, bound: 0.13
min mse: 1209245062.9746397, bound: 0.14
min mse: 896062178.4057399, bound: 0.15
min mse: 291190524.2976007, bound: 0.16
min mse: 23781040.40587693, bound: 0.17
min mse: 2077515.3012350122, bound: 0.2
min mse: 815887.1912123049, bound: 0.22
min mse: 206106.4522611907, bound: 0.29
min mse: 24325.29532961676, bound: 0.32
min mse: 1660.5301907689302, bound: 0.38
min mse: 1610.8386911460998, bound: 0.43
min mse: 175.54776288760124, bound: 0.51
min mse: 104.02217011238449, bound: 0.55
min mse: 60.43729325781083, bound: 0.59
min mse: 57.108642163798265,

In [41]:
bounded_data = rdd_data[(rdd_data.history_log >= threshold_value - min_bound) & (rdd_data.history_log < threshold_value + min_bound)]

In [42]:
bounded_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,history_log,history_log_grp
2,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0,5.486538,5.5
8,2,3) $200 - $350,203.35,1,0,Rural,0,Web,No E-Mail,0,0,0.0,0,5.314929,5.3
9,9,3) $200 - $350,334.24,1,0,Urban,0,Web,Mens E-Mail,0,0,0.0,1,5.811859,5.8
10,1,5) $500 - $750,514.52,0,1,Surburban,1,Web,Mens E-Mail,0,0,0.0,1,6.243234,6.2
12,7,5) $500 - $750,520.43,0,1,Surburban,1,Web,Mens E-Mail,0,0,0.0,1,6.254655,6.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21280,4,3) $200 - $350,337.36,1,0,Urban,0,Web,Mens E-Mail,1,0,0.0,1,5.821151,5.8
21285,10,2) $100 - $200,168.21,0,1,Surburban,0,Phone,No E-Mail,0,0,0.0,0,5.125213,5.1
21286,4,2) $100 - $200,125.53,0,1,Rural,1,Phone,No E-Mail,0,0,0.0,0,4.832545,4.8
21290,1,5) $500 - $750,519.69,1,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,6.253232,6.3


In [43]:
coef = min_rddr.results.summary().tables[1]
coef = pd.read_html(coef.as_html(), header=0, index_col=0)[0]

In [44]:
coef

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1348,0.024,5.716,0.0,0.089,0.181
X^1,0.2093,0.358,0.584,0.559,-0.493,0.912
X^2,0.5904,1.608,0.367,0.713,-2.561,3.742
X^3,0.9937,2.67,0.372,0.71,-4.24,6.228
X^4,0.5926,1.463,0.405,0.685,-2.276,3.461
D,0.1031,0.033,3.104,0.002,0.038,0.168
D_X^1,-0.8053,0.515,-1.563,0.118,-1.815,0.204
D_X^2,2.8832,2.338,1.233,0.218,-1.7,7.467
D_X^3,-7.343,3.913,-1.877,0.061,-15.013,0.327
D_X^4,3.0153,2.158,1.397,0.162,-1.214,7.245


In [45]:
print(f'''
Bandwidth:\t{min_bound}
Observations:\t{len(bounded_data)}
Estimate:\t\t{min_rddr.results.params['D']: .4f}
std err:\t\t{coef.loc['D', 'std err']}
''')


Bandwidth:	0.91
Observations:	11960
Estimate:		 0.1031
std err:		0.033



RDestimateの推定（書籍の記述）より少しバンド幅が広い

### 分析結果の可視化

In [46]:
visualized = rdd_data.copy()
visualized['y_pred'] = min_rddr.predict(rdd_data[['history_log']])
# 可視化する際cut off値で繋げないようにデータを分ける
left_rdd_data = visualized[visualized.history_log < threshold_value].copy()
right_rdd_data = visualized[visualized.history_log >= threshold_value].copy()
left_rdd_data = left_rdd_data.sort_values('history_log')
left_rdd_data['y_pred'] = left_rdd_data.y_pred.apply(lambda x: np.nan if x > 0.2 else x) # 可視化のため大きい数値をなくす
right_rdd_data = right_rdd_data.sort_values('history_log')
right_rdd_data['y_pred'] = right_rdd_data.y_pred.apply(lambda x: np.nan if x > 0.5 else x) # 可視化のため大きい数値をなくす


fig = px.scatter(summarised, x='history_log_grp', y='visit', color='segment', symbol='segment', size='N'
           , title='RDDの可視化：バンド幅を調整後')

fig.add_trace(go.Scatter(
    x=[threshold_value, threshold_value],
    y=[0, 0.35],
    mode="lines",
    line=dict(color='gray', dash='dot'),
    name='cut-off value'
))

fig.add_trace(go.Scatter(
    x=left_rdd_data.history_log,
    y=left_rdd_data.y_pred,
    mode='lines',
    name='回帰の結果(left)'
))

fig.add_trace(go.Scatter(
    x=right_rdd_data.history_log,
    y=right_rdd_data.y_pred,
    mode='lines',
    name='回帰の結果(right)'
))

In [47]:
fig.write_html('ch5_plot4.html')

なんかあまり良くない気がする。やっぱり雑に調整するのは良くなかったのか