## 数据预处理

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

%matplotlib inline



In [2]:
# 计算effect_size  0.13为当前的转换率  0.15为目标转化率 也就是说我们希望通过新的设计带来2%的提升
effect_size = sms.proportion_effectsize(0.13, 0.15)   

required_n = sms.NormalIndPower().solve_power(
    effect_size, # 传入上面计算的 effect_size
    power=0.8, # 设置 1-β = 80%
    alpha=0.05, # 设置 α 为5%
    ratio=1  # 对照组和测试组人一样, 这里的ratio 比例就是1
    )                                                  
#对结果向上取整
required_n = ceil(required_n)                         

print(required_n)

4720


- 加载数据

In [3]:
df = pd.read_csv('data/ab_data.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


- 创建透视表, 查询是否对照组看到的都是老页面

In [5]:
pd.pivot_table(df, index='group', columns='landing_page', values='user_id', aggfunc='count')

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


- 在我们进行后续处理之前, 还要查看是否有用户进行了多次操作

In [7]:
session_counts = df['user_id'].value_counts(ascending=False)

In [8]:
session_counts[session_counts>1].count()

3894

In [12]:
users = session_counts[session_counts<2].index

In [13]:
df = df[df['user_id'].isin(users)]

In [14]:
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0


- 数据采样

In [21]:
control_sample = df[df['group'] == 'control'].sample(n=required_n, random_state=22)
treatment_sample = df[df['group'] == 'treatment'].sample(n=required_n, random_state=22)


In [17]:
treatment_sample

Unnamed: 0,user_id,timestamp,group,landing_page,converted
259346,860447,2017-01-11 21:20:47.193292,treatment,new_page,0
237647,845654,2017-01-06 21:49:33.725054,treatment,new_page,0
73088,833106,2017-01-11 21:56:24.637002,treatment,new_page,0
121106,665687,2017-01-08 04:17:45.135586,treatment,new_page,0
78032,658409,2017-01-22 13:18:58.765132,treatment,new_page,0
...,...,...,...,...,...
46153,908512,2017-01-14 22:02:29.922674,treatment,new_page,0
235886,873211,2017-01-05 00:57:16.167151,treatment,new_page,0
268794,631276,2017-01-20 18:56:58.167809,treatment,new_page,0
190461,662301,2017-01-03 08:10:57.768806,treatment,new_page,0


In [26]:
ab_test = pd.concat([control_sample, treatment_sample], axis=0)

In [27]:
ab_test

Unnamed: 0,user_id,timestamp,group,landing_page,converted
186628,763854,2017-01-21 03:43:17.188315,control,old_page,0
57904,690555,2017-01-18 06:38:13.079449,control,old_page,0
13276,861520,2017-01-06 21:13:40.044766,control,old_page,0
228415,630778,2017-01-05 16:42:36.995204,control,old_page,0
96065,656634,2017-01-04 15:31:21.676130,control,old_page,0
...,...,...,...,...,...
46153,908512,2017-01-14 22:02:29.922674,treatment,new_page,0
235886,873211,2017-01-05 00:57:16.167151,treatment,new_page,0
268794,631276,2017-01-20 18:56:58.167809,treatment,new_page,0
190461,662301,2017-01-03 08:10:57.768806,treatment,new_page,0


In [28]:
ab_test.reset_index(inplace=True, drop=True)

In [29]:
ab_test

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,763854,2017-01-21 03:43:17.188315,control,old_page,0
1,690555,2017-01-18 06:38:13.079449,control,old_page,0
2,861520,2017-01-06 21:13:40.044766,control,old_page,0
3,630778,2017-01-05 16:42:36.995204,control,old_page,0
4,656634,2017-01-04 15:31:21.676130,control,old_page,0
...,...,...,...,...,...
9435,908512,2017-01-14 22:02:29.922674,treatment,new_page,0
9436,873211,2017-01-05 00:57:16.167151,treatment,new_page,0
9437,631276,2017-01-20 18:56:58.167809,treatment,new_page,0
9438,662301,2017-01-03 08:10:57.768806,treatment,new_page,0


- 查看两组数据情况

In [30]:
ab_test.groupby('group')['landing_page'].value_counts()

group      landing_page
control    old_page        4720
treatment  new_page        4720
Name: landing_page, dtype: int64

## 分析实验

- 计算一下两组的转化率和标准差

In [31]:
conversion_rates = ab_test.groupby('group')['converted'].mean().to_frame()
conversion_rates 

conversion_rates.style.format('{:.3f}')

Unnamed: 0_level_0,converted
group,Unnamed: 1_level_1
control,0.123
treatment,0.126


- 假设检验

- 我们可以使用statsmodels.stats.proportion 模块来计算P值和置信区间

In [32]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [33]:
control_results = ab_test[ab_test['group'] == 'control']['converted']     #获取对照组是否转化的数据
treatment_results = ab_test[ab_test['group'] == 'treatment']['converted'] #获取实验组是否转化的数据

In [35]:
control_results.value_counts()

0    4138
1     582
Name: converted, dtype: int64

In [36]:
treatment_results.value_counts()

0    4127
1     593
Name: converted, dtype: int64

In [37]:
n_con = control_results.count() # 获取对照组人数
n_treat = treatment_results.count()  # 获取实验组人数

In [38]:
nobs = [n_con, n_treat]

In [39]:
nobs

[4720, 4720]

In [40]:
successes = [control_results.sum(), treatment_results.sum()] 

In [41]:
successes

[582, 593]

In [42]:
proportion_confint(88, 100, alpha=0.05)

(0.8163087092715731, 0.943691290728427)

In [43]:
z_stat, pval = proportions_ztest(successes, nobs=nobs)  #计算P值
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05) #计算置信区间

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: -0.34
p-value: 0.732
ci 95% for control group: [0.114, 0.133]
ci 95% for treatment group: [0.116, 0.135]


In [45]:
proportions_ztest?

[0;31mSignature:[0m
[0mproportions_ztest[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mcount[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnobs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalue[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malternative[0m[0;34m=[0m[0;34m'two-sided'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprop_var[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Test for proportions based on normal (z) test

Parameters
----------
count : {int, array_like}
    the number of successes in nobs trials. If this is array_like, then
    the assumption is that this represents the number of successes for
    each independent sample
nobs : {int, array_like}
    the number of trials or observations, with the same length as
    count.
value : float, array_like or None, optional
    This is the value of the null hypothesis equal to the proportion in the
    case of 