In [71]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency
from random import randint
from bootstrapped import bootstrap as bs
from bootstrapped import compare_functions as bs_compare
from bootstrapped import stats_functions as bs_stats
from datetime import datetime as dt

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('AB_Test_Results.csv', sep = ',')

In [3]:
data.shape

(10000, 3)

In [4]:
data.columns

Index(['USER_ID', 'VARIANT_NAME', ' REVENUE '], dtype='object')

In [5]:
data.rename(columns={' REVENUE ': 'REVENUE'}).columns

Index(['USER_ID', 'VARIANT_NAME', 'REVENUE'], dtype='object')

In [6]:
data = data.rename(columns={' REVENUE ': 'REVENUE'})

In [7]:
data.describe()

Unnamed: 0,USER_ID,REVENUE
count,10000.0,10000.0
mean,4981.0802,0.099447
std,2890.590115,2.318529
min,2.0,0.0
25%,2468.75,0.0
50%,4962.0,0.0
75%,7511.5,0.0
max,10000.0,196.01


In [8]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
print(categorical_columns)
print(numerical_columns)

['VARIANT_NAME']
['USER_ID', 'REVENUE']


In [9]:
data[categorical_columns].describe()

Unnamed: 0,VARIANT_NAME
count,10000
unique,2
top,variant
freq,5016


In [10]:
data.groupby(['VARIANT_NAME']).count()

Unnamed: 0_level_0,USER_ID,REVENUE
VARIANT_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
control,4984,4984
variant,5016,5016


In [11]:
data.groupby(['VARIANT_NAME']).USER_ID.nunique()

VARIANT_NAME
control    3931
variant    3934
Name: USER_ID, dtype: int64

In [12]:
data.groupby(['VARIANT_NAME']).USER_ID.count()

VARIANT_NAME
control    4984
variant    5016
Name: USER_ID, dtype: int64

In [13]:
data.groupby(['USER_ID']).USER_ID.transform('count').fillna(-1)

0       2
1       3
2       2
3       2
4       1
5       1
6       2
7       1
8       3
9       3
10      1
11      4
12      1
13      4
14      1
15      3
16      1
17      3
18      3
19      2
20      1
21      3
22      2
23      2
24      1
25      3
26      3
27      2
28      2
29      2
       ..
9970    2
9971    3
9972    2
9973    1
9974    1
9975    5
9976    3
9977    1
9978    3
9979    1
9980    2
9981    1
9982    1
9983    1
9984    4
9985    2
9986    1
9987    2
9988    1
9989    4
9990    2
9991    1
9992    2
9993    2
9994    2
9995    3
9996    1
9997    3
9998    1
9999    1
Name: USER_ID, Length: 10000, dtype: int64

In [14]:
data.groupby(['USER_ID']).VARIANT_NAME.transform('nunique').fillna(-1)

0       1
1       2
2       2
3       2
4       1
5       1
6       2
7       1
8       2
9       1
10      1
11      1
12      1
13      2
14      1
15      2
16      1
17      2
18      2
19      1
20      1
21      2
22      1
23      1
24      1
25      2
26      1
27      2
28      1
29      2
       ..
9970    1
9971    2
9972    1
9973    1
9974    1
9975    2
9976    2
9977    1
9978    2
9979    1
9980    1
9981    1
9982    1
9983    1
9984    2
9985    1
9986    1
9987    1
9988    1
9989    2
9990    2
9991    1
9992    2
9993    1
9994    1
9995    2
9996    1
9997    2
9998    1
9999    1
Name: VARIANT_NAME, Length: 10000, dtype: int64

In [15]:
data.groupby(['USER_ID']).REVENUE.transform('sum').fillna(-1)

0       0.00
1       0.00
2       0.00
3       0.00
4       0.00
5       0.00
6       0.00
7       0.00
8       0.00
9       0.00
10      0.00
11      0.00
12      0.00
13      2.15
14      0.00
15      0.00
16      0.00
17      0.00
18      0.00
19      0.06
20      0.00
21      0.00
22      0.00
23      0.00
24      0.00
25      0.00
26      0.00
27      0.00
28      0.00
29      0.00
        ... 
9970    0.00
9971    0.00
9972    0.00
9973    0.00
9974    0.00
9975    0.00
9976    0.00
9977    1.01
9978    1.92
9979    0.00
9980    0.00
9981    0.00
9982    0.00
9983    0.00
9984    0.00
9985    0.00
9986    0.00
9987    0.00
9988    0.00
9989    0.00
9990    0.00
9991    0.00
9992    0.00
9993    0.00
9994    0.00
9995    0.00
9996    0.00
9997    0.00
9998    0.00
9999    0.00
Name: REVENUE, Length: 10000, dtype: float64

In [16]:
data_aggr = data.copy()
data_aggr['uid_count'] = data.groupby(['USER_ID']).USER_ID.transform('count').fillna(-1)
data_aggr['bucket_unique'] = data_aggr.groupby(['USER_ID']).VARIANT_NAME.transform('nunique').fillna(-1)
data_aggr['revenue_total'] = data_aggr.groupby(['USER_ID']).REVENUE.transform('sum').fillna(-1)

In [17]:
data_aggr.head()

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE,uid_count,bucket_unique,revenue_total
0,737,variant,0.0,2,1,0.0
1,2423,control,0.0,3,2,0.0
2,9411,control,0.0,2,2,0.0
3,7311,control,0.0,2,2,0.0
4,6174,variant,0.0,1,1,0.0


In [18]:
data_aggr[data_aggr['USER_ID'] == 2423]

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE,uid_count,bucket_unique,revenue_total
1,2423,control,0.0,3,2,0.0
5243,2423,control,0.0,3,2,0.0
8556,2423,variant,0.0,3,2,0.0


In [19]:
data_aggr[data_aggr['bucket_unique'] > 1].USER_ID.nunique()

1541

In [20]:
data_aggr[data_aggr['bucket_unique'] == 1].USER_ID.nunique()

4783

In [21]:
data_aggr[data_aggr['bucket_unique'] == 1].groupby('VARIANT_NAME').USER_ID.nunique()

VARIANT_NAME
control    2390
variant    2393
Name: USER_ID, dtype: int64

In [22]:
data_aggr[data_aggr['bucket_unique'] > 1].groupby('VARIANT_NAME').USER_ID.nunique()

VARIANT_NAME
control    1541
variant    1541
Name: USER_ID, dtype: int64

In [23]:
data_aggr.groupby('VARIANT_NAME').USER_ID.nunique()

VARIANT_NAME
control    3931
variant    3934
Name: USER_ID, dtype: int64

**Вывод первый**: в эксперименте часть пользователей попала в 2 группы - это возможнно при техническом сбое при выдаче uid'ов (например, bucket выдаётся по coockie, а она может слетать); второй вариант - исходно неверно поставленный механизм выдачи bucket'а - если говорить проо социальную сеть, то логично выдавать bucket залогированному пользователю (дейтсвия покупки доступны тем, кто залогирован), если же выдавать bucket по coockie, то пользователю, залогировавшемуся с разных браузерах, будут показываться разные версии сайта;

__<i>Прим.</i>__: Таких наблюдений (с пользователями, попавшими сразу в 2 группы) в эксперименте - 39%, что является большой долей. В таком случае мы не можем учитывать их поведение, поскольку выборки будут связанными. 
Следует отметить, что у нас возможен вариант дизайна эксперимента таким образом, что пользователь будет видеть сначала одну, затем другую версию сайта, тогда при рассчёте результатов нужно будет использовать критерии Уилкоксона для связных выборок (scipy.stats.wilcoxon), t-критерий для связных выборок (scipy.stats.ttest_rel), критерии Мак-Немара или Фишера, а также, если мы исследуем влияние в режиме "до-после", мы можем воспользоваться библиотекой CausalImpact для оценки влияния на экспериментальную группу, как на временной ряд, и взяв контрольную группу в качестве ряда-предиктора (если в группах ннаблюдаются корреляции).

Пример 1 - script.R для 2-х регионов:

```R
library(CausalImpact)

regions <- read.csv2("~/Regions.csv", sep=',')
# y - исследуемый ряд с импактом (период импакта - с 10.09 по 25.11)
# x - ряд-предиктор
# t - даты
data <- zoo(cbind(regions$y, regions$x1), regions$t)
pre.period <- c(1, 322)
post.period <- c(323, 329)

impact <- CausalImpact(data, pre.period, post.period)
plot(impact)

write.csv(cbind(regions$y[1:329], impact$series$point.pred[1:329]), file = "Regions_Model.csv")

summary(impact)


library(ggplot2)
library(tidyr)

test_data <-
  data.frame(
    y_pred = impact$series$point.pred,
    y = regions$y,
    t = regions$Period
  )
test_data %>%
  gather(key,value, y_pred, y) %>%
  ggplot(aes(x=date, y=value, colour=key)) +
  geom_line()
```


Пример 2 - script.R c датами:
```R
library(CausalImpact)

regions <- read.csv2("~/Regions.csv", sep=';')
regions$Period <- as.Date(regions$Period, format = "%Y-%m-%d")
data <- zoo(cbind(regions$SPB, regions$MSK), regions$Period)

pre.period <- as.Date(c("2018-01-01", "2018-08-31"))
post.period <- as.Date(c("2018-09-01", "2018-11-30"))

impact <- CausalImpact(data, pre.period, post.period)
plot(impact)
summary(impact)

write.csv2(zoo(cbind(regions$SPB, coredata(impact$series$point.pred)), index(impact$series$point.pred)), file = "Regions_Model.csv")
```


В данном случае будем считать, что мы имеем дело с классическим вариантом A/B и не будем учитывать пользователей, которые попали в 2 группы, считая такую ситуацию ошибочной.

In [24]:
data_clear = data_aggr[data_aggr['bucket_unique'] == 1]

In [25]:
data_clear.head()

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE,uid_count,bucket_unique,revenue_total
0,737,variant,0.0,2,1,0.0
4,6174,variant,0.0,1,1,0.0
5,2380,variant,0.0,1,1,0.0
7,9168,control,0.0,1,1,0.0
9,7548,control,0.0,3,1,0.0


In [26]:
data_clear['index'] = data_clear.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [27]:
data_clear['index']

0          0
4          4
5          5
7          7
9          9
10        10
11        11
12        12
14        14
16        16
19        19
20        20
22        22
23        23
24        24
26        26
28        28
30        30
32        32
36        36
38        38
39        39
41        41
42        42
44        44
49        49
52        52
53        53
54        54
55        55
        ... 
9949    9949
9951    9951
9952    9952
9953    9953
9960    9960
9962    9962
9964    9964
9966    9966
9967    9967
9968    9968
9970    9970
9972    9972
9973    9973
9974    9974
9977    9977
9979    9979
9980    9980
9981    9981
9982    9982
9983    9983
9985    9985
9986    9986
9987    9987
9988    9988
9991    9991
9993    9993
9994    9994
9996    9996
9998    9998
9999    9999
Name: index, Length: 6070, dtype: int64

In [28]:
data_clear['action'] = data_clear.apply(lambda x: 1 if x['REVENUE'] > 0 else 0, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Предварительный анализ данных

## Визуализация данных по выручке 

#### 1. С каждой транзакции (считая, что запись в таблице несет смысл транзакции)

In [29]:
# data_clear.sort_values(by=['index', 'VARIANT_NAME']).groupby(['index', 'VARIANT_NAME', 'action'])[['USER_ID']].count().reset_index()
data_graph = data_clear.sort_values(by=['index', 'VARIANT_NAME']).groupby(['index', 'VARIANT_NAME'])[['REVENUE']].sum().reset_index()

In [30]:
data_graph

Unnamed: 0,index,VARIANT_NAME,REVENUE
0,0,variant,0.00
1,4,variant,0.00
2,5,variant,0.00
3,7,control,0.00
4,9,control,0.00
5,10,control,0.00
6,11,control,0.00
7,12,control,0.00
8,14,variant,0.00
9,16,variant,0.00


In [31]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace_high = go.Scatter(
                x=data_graph[data_graph['VARIANT_NAME']=='control'].index,
                y=data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE,
                name = "Group Control",
                line = dict(color = '#17BECF'),
                opacity = 0.8)

trace_low = go.Scatter(
                x=data_graph[data_graph['VARIANT_NAME']=='variant'].index,
                y=data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE,
                name = "Group Variant",
                line = dict(color = 'rgb(148, 103, 189)'),
                opacity = 0.8,)
#                 yaxis='y2')




data_chart = [trace_high, trace_low]

layout = dict(
    title = "REVENUE",
    xaxis = dict(
        title='Days',
        range = [0,max(len(data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE), 
                       len(data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE)
                      )]
    ),
    
    yaxis=dict(
        title='REVENUE'
    ),
)


fig = dict(data=data_chart, layout=layout)
iplot(fig, filename = "Bougth VAS")

#### 2. С каждого пользователя

In [32]:
data_graph = data_clear.sort_values(by=['USER_ID', 'VARIANT_NAME']).groupby(['USER_ID', 'VARIANT_NAME'])[['REVENUE']].sum().reset_index()

In [33]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace_high = go.Scatter(
                x=data_graph[data_graph['VARIANT_NAME']=='control'].USER_ID,
                y=data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE,
                name = "Group Control",
                line = dict(color = '#17BECF'),
                opacity = 0.8)

trace_low = go.Scatter(
                x=data_graph[data_graph['VARIANT_NAME']=='variant'].USER_ID,
                y=data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE,
                name = "Group Variant",
                line = dict(color = 'rgb(148, 103, 189)'),
                opacity = 0.8,)
#                 yaxis='y2')




data_chart = [trace_high, trace_low]

layout = dict(
    title = "REVENUE",
    xaxis = dict(
        title='Days',
        range = [0,max(len(data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE), 
                       len(data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE)
                      )]
    ),
    
    yaxis=dict(
        title='REVENUE'
    ),
)


fig = dict(data=data_chart, layout=layout)
iplot(fig, filename = "Bougth VAS")

### 3. Box-plot'ы и гистограмма по выручке с пользователя:

In [34]:
# data_graph = data_clear.sort_values(by=['VARIANT_NAME'])[data_clear['REVENUE'] > 0].groupby(['VARIANT_NAME'])[['USER_ID']].count().reset_index()
data_graph = data_clear.sort_values(by=['VARIANT_NAME'])[data_clear['REVENUE'] > 0].groupby(['USER_ID', 'VARIANT_NAME'])[['REVENUE']].sum().reset_index()


Boolean Series key will be reindexed to match DataFrame index.



In [35]:
data_graph.head()

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,56,variant,2.99
1,124,control,1.25
2,169,control,4.33
3,282,control,18.56
4,443,variant,3.75


In [36]:
group_a = go.Box(x=data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE, name = "Group A")
group_b = go.Box(x=data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE, name = "Group B")


layout = dict(
    title = "REVENUE",
    xaxis = dict(
        title='Days',
        range = [0,max(len(data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE), 
                       len(data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE))]
    ),
    
    yaxis=dict(
        title='REVENUE'
    ),
)

data = [group_a, group_b]
iplot(data)

Наблюдаем подозрительную точку (назовём её super-транзакцию или power-транзакцией), уберём её (чуть ниже я скажу, что подобный выброс в некоторых случаях достоен более глубокого рассмотрения и исследования, а не просто выбрасывания со счетов).

In [39]:
group_a = go.Box(x=data_graph[(data_graph['VARIANT_NAME']=='control') & (data_graph['REVENUE']<196)].REVENUE, name = "Group A")
group_b = go.Box(x=data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE, name = "Group B")


layout = dict(
    title = "REVENUE",
    xaxis = dict(
        title='Days',
        range = [0,max(len(data_graph[(data_graph['VARIANT_NAME']=='control') & (data_graph['REVENUE']<196)].REVENUE), 
                       len(data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE))]
    ),
    
    yaxis=dict(
        title='REVENUE'
    ),
)

data = [group_a, group_b]
iplot(data)

**Вывод к п.3.1**: визуально группа B проигрывает.

In [41]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=data_graph[data_graph['VARIANT_NAME']=='control'].REVENUE,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Totals')

In [43]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=data_graph[(data_graph['VARIANT_NAME']=='control') & (data_graph['REVENUE']<196)].REVENUE,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=data_graph[data_graph['VARIANT_NAME']=='variant'].REVENUE,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Totals')

**Вывод к п.3.2**: визуально снова группа B проигрывает. Распределение имеет вид chi-square.

### Визуализация данных по кол-ву покупок
(платных транзаций);
index рассматриваем как транзакцию

In [44]:
data_graph = data_clear.sort_values(by=['index', 'VARIANT_NAME']).groupby(['index', 'VARIANT_NAME'])[['action']].sum().reset_index()

In [45]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace_high = go.Scatter(
                x=data_graph[data_graph['VARIANT_NAME']=='control'].index,
                y=data_graph[data_graph['VARIANT_NAME']=='control'].action,
                name = "Group Control",
                line = dict(color = '#17BECF'),
                opacity = 0.8)

trace_low = go.Scatter(
                x=data_graph[data_graph['VARIANT_NAME']=='variant'].index,
                y=data_graph[data_graph['VARIANT_NAME']=='variant'].action,
                name = "Group Variant",
                line = dict(color = 'rgb(148, 103, 189)'),
                opacity = 0.8,)
#                 yaxis='y2')




data_chart = [trace_high, trace_low]

layout = dict(
    title = "Action",
    xaxis = dict(
        title='Days',
        range = [0,max(len(data_graph[data_graph['VARIANT_NAME']=='variant'].action), 
                       len(data_graph[data_graph['VARIANT_NAME']=='control'].action)
                      )]
    ),
    
    yaxis=dict(
        title='Action'
    ),
)


fig = dict(data=data_chart, layout=layout)
iplot(fig, filename = "Bougth VAS")

### Box-plot'ы и гистограмма по платным транзакциям с пользователя:

In [46]:
# data_graph = data_clear.sort_values(by=['VARIANT_NAME'])[data_clear['REVENUE'] > 0].groupby(['USER_ID', 'VARIANT_NAME'])[['REVENUE']].sum().reset_index()
data_graph = data_clear.sort_values(by=['VARIANT_NAME']).groupby(['USER_ID', 'VARIANT_NAME'])[['action']].sum().reset_index()

In [47]:
group_a = go.Box(x=data_graph[data_graph['VARIANT_NAME']=='control'].action, name = "Group A")
group_b = go.Box(x=data_graph[data_graph['VARIANT_NAME']=='variant'].action, name = "Group B")


layout = dict(
    title = "REVENUE",
    xaxis = dict(
        title='Days',
        range = [0,max(len(data_graph[data_graph['VARIANT_NAME']=='control'].action), 
                       len(data_graph[data_graph['VARIANT_NAME']=='variant'].action))]
    ),
    
    yaxis=dict(
        title='REVENUE'
    ),
)

data = [group_a, group_b]
iplot(data)

In [48]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=data_graph[data_graph['VARIANT_NAME']=='control'].action,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=data_graph[data_graph['VARIANT_NAME']=='variant'].action,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Totals')

**Вывод к пункту**: визуально нельзя выделить проигравшую группу. Распределение имеет вид **биномиального**.

## 2. Суммарные данные:

In [49]:
data_clear.sort_values(by=['index', 'VARIANT_NAME']).groupby(['VARIANT_NAME'])[['action']].sum().reset_index()

Unnamed: 0,VARIANT_NAME,action
0,control,54
1,variant,43


In [50]:
data_clear.sort_values(by=['index', 'VARIANT_NAME']).groupby(['VARIANT_NAME'])[['REVENUE']].sum().reset_index()

Unnamed: 0,VARIANT_NAME,REVENUE
0,control,470.56
1,variant,179.32


In [51]:
data_clear[(data_clear['REVENUE'] > 0) & (data_clear['VARIANT_NAME'] == 'control')][['REVENUE', 'action']].describe()

Unnamed: 0,REVENUE,action
count,54.0,54.0
mean,8.714074,1.0
std,26.627446,0.0
min,0.02,1.0
25%,1.585,1.0
50%,3.12,1.0
75%,6.0075,1.0
max,196.01,1.0


In [52]:
data_clear[(data_clear['REVENUE'] > 0) & (data_clear['REVENUE'] < 196) & (data_clear['VARIANT_NAME'] == 'control')][['REVENUE', 'action']].describe()

Unnamed: 0,REVENUE,action
count,53.0,53.0
mean,5.180189,1.0
std,5.942771,0.0
min,0.02,1.0
25%,1.56,1.0
50%,2.99,1.0
75%,4.98,1.0
max,29.32,1.0


**Вывод второй**: в выборке присутствует выброс - кто-то заплатил 196 рублей.
Является ли это результатом совпадения ряда условий, 
пользователь является представителем определённого соц-дема или географического региона, сказать мы этого не можем.
Однако в дальнейшем, если бы наша выборка была больше и мы заметили бы несколько таких выбросов, 
или же при проведении схожего эксперимента мы бы также заметили такой выброс, 
то следовало бы посмотреть повнимательнее на срез, в котором такие покупки происходят - возможно, 
мы могли бы найти точки роста в группе пользователей, соовершающих наиболее дорогие покупки.

In [53]:
data_clear[(data_clear['REVENUE'] > 0) & (data_clear['VARIANT_NAME'] == 'variant')][['REVENUE', 'action']].describe()

Unnamed: 0,REVENUE,action
count,43.0,43.0
mean,4.170233,1.0
std,4.932851,0.0
min,0.02,1.0
25%,1.25,1.0
50%,2.72,1.0
75%,4.27,1.0
max,23.04,1.0


In [55]:
q005 = data_clear[(data_clear['REVENUE'] > 0) & (data_clear['VARIANT_NAME'] == 'control')].REVENUE.quantile(0.05)
q095 = data_clear[(data_clear['REVENUE'] > 0) & (data_clear['VARIANT_NAME'] == 'control')].REVENUE.quantile(0.95)

data_clear[(data_clear['REVENUE'] > q005) & (data_clear['REVENUE'] < q095) & (data_clear['VARIANT_NAME'] == 'control')][['REVENUE', 'action']].describe()

Unnamed: 0,REVENUE,action
count,48.0,48.0
mean,4.689167,1.0
std,4.473974,0.0
min,0.23,1.0
25%,1.75,1.0
50%,3.12,1.0
75%,4.9575,1.0
max,18.56,1.0


In [56]:
q005 = data_clear[(data_clear['REVENUE'] > 0) & (data_clear['VARIANT_NAME'] == 'variant')].REVENUE.quantile(0.05)
q095 = data_clear[(data_clear['REVENUE'] > 0) & (data_clear['VARIANT_NAME'] == 'variant')].REVENUE.quantile(0.95)

data_clear[(data_clear['REVENUE'] > q005) & (data_clear['REVENUE'] < q095) & (data_clear['VARIANT_NAME'] == 'variant')][['REVENUE', 'action']].describe()

Unnamed: 0,REVENUE,action
count,36.0,36.0
mean,3.075278,1.0
std,2.279854,0.0
min,0.5,1.0
25%,1.2875,1.0
50%,2.61,1.0
75%,3.8675,1.0
max,10.82,1.0


In [57]:
data_clear[(data_clear['REVENUE'] < 196)].sort_values(by=['index', 'VARIANT_NAME']).groupby(['VARIANT_NAME'])[['REVENUE']].sum().reset_index()

Unnamed: 0,VARIANT_NAME,REVENUE
0,control,274.55
1,variant,179.32


У нас биномиальное распределение. Здесь имеет смысл смотреть на 2 метрики: доход **REVENUE** и **кол-во покупок**, общее и по пользователям.
В выборке есть "выбросы" по REVENUE, поэтому для данной метрики имеет смысл в дальнейшем обрезать перцентили 0.05 и 0.95. 
Изначально проведём быстрый анализ и обрежем только выброс в 196 рублей, чтобы не выбрасывать много данные.

In [58]:
data_clear = data_clear[(data_clear['REVENUE'] < 196)]

In [59]:
data_clear.describe()

Unnamed: 0,USER_ID,REVENUE,uid_count,bucket_unique,revenue_total,index,action
count,6069.0,6069.0,6069.0,6069.0,6069.0,6069.0,6069.0
mean,4983.934586,0.074785,1.489372,1.0,0.120707,4977.591036,0.015818
std,2898.152264,0.907245,0.693762,0.0,1.162639,2882.622967,0.124782
min,2.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,2472.0,0.0,1.0,1.0,0.0,2490.0,0.0
50%,4951.0,0.0,1.0,1.0,0.0,4968.0,0.0
75%,7501.0,0.0,2.0,1.0,0.0,7454.0,0.0
max,9998.0,29.32,6.0,1.0,29.32,9999.0,1.0


Выше мы рассматривали поюзерные метрики;

Ещё один вариант - рассматривам по-bucket'ные метрики, или метрики на группы (мы можем применить к каждой из 2-х групп, экспериментальной и контрольной, метод sub-bucket'ов или его развитие - метод bootstrap'а, с целью привести распределение к нормальному виду и иметь возможность корректно применить t-test);

Работая с биномиальным распределением формата "купил - не купил", использовать напрямую t-test нельзя, ведь он работает с вещественными числами; можно случайным образом разбить имеющий набор данных на группы-bucket'ы, и затем воспользоваться трюком, вычислив долю в каждом sub-bucket'е (так называемую **ratio-метрику**), и затем уже доли по sub-bucket'ам использовать в t-test'е; 

Однако есть более доверенный и устойчивый подход для рассчёта стат. значимости **ratio-метрик** - это метод линеаризации, который преобразует ratio-метрику в поюзерную и позволяет применить t-test.

In [60]:
data_test_actions = data_clear.sort_values(by=['VARIANT_NAME']).groupby(['USER_ID', 'VARIANT_NAME'])[['action']].sum().reset_index()
data_test_revenue = data_clear.sort_values(by=['VARIANT_NAME']).groupby(['USER_ID', 'VARIANT_NAME'])[['REVENUE']].sum().reset_index()

In [61]:
data_test_actions.head()

Unnamed: 0,USER_ID,VARIANT_NAME,action
0,2,control,0
1,4,variant,0
2,5,variant,0
3,6,variant,0
4,9,variant,0


In [62]:
data_test_revenue.head()

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,2,control,0.0
1,4,variant,0.0
2,5,variant,0.0
3,6,variant,0.0
4,9,variant,0.0


In [63]:
data_clear.head()

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE,uid_count,bucket_unique,revenue_total,index,action
0,737,variant,0.0,2,1,0.0,0,0
4,6174,variant,0.0,1,1,0.0,4,0
5,2380,variant,0.0,1,1,0.0,5,0
7,9168,control,0.0,1,1,0.0,7,0
9,7548,control,0.0,3,1,0.0,9,0


In [64]:
users_revenue = {'control': list(), 'variant': list()}
users_actions = {'control': list(), 'variant': list()}

for uid in data_clear.USER_ID.tolist():
    # print("user_id = {user_id}, revenue = {revenue}".format(user_id = uid, revenue = '\t'.join([str(x) for x in data_clear[data_clear['USER_ID'] == uid].REVENUE.tolist()])))
    bucket = data_clear[data_clear['USER_ID'] == uid]['VARIANT_NAME'].unique()
    if len(bucket) > 1:
        break
    else:
        bucket = bucket[0]
    revenue = data_clear[data_clear['USER_ID'] == uid].REVENUE.tolist()
    action = data_clear[data_clear['USER_ID'] == uid].action.tolist()
    users_revenue[bucket].append(revenue)
    users_actions[bucket].append(action)

In [65]:
users_revenue

{'control': [[0.0],
  [0.0, 0.0, 0.0],
  [0.0],
  [0.0, 0.0, 0.0, 0.0],
  [0.0],
  [0.0, 0.0],
  [0.0, 0.0, 0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [3.25, 0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0, 0.0, 0.0],
  [0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [0.0, 0.04],
  [0.0, 0.0],
  [0.0],
  [0.0, 0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [2.99],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0],
  [0.0, 0.0],
  [0.0, 0.0, 0.0],
  [0.0, 0.0],

In [66]:
# data_test_revenue[data_test_revenue['VARIANT_NAME'] == 'control'].groupby(['USER_ID']).REVENUE.sum().tolist()[10:20:5]

In [67]:
# data_test_revenue[data_test_revenue['VARIANT_NAME'] == 'variant'].groupby(['USER_ID'])[['action']].sum()

In [68]:
# data_test_actions[data_test_actions['VARIANT_NAME'] == 'control'].groupby(['USER_ID'])

In [69]:
# data_test_actions[data_test_actions['VARIANT_NAME'] == 'variant'].groupby(['USER_ID'])

## Проверяем равенство дисперсий:

Воспользуемся критерием Левена, который менее чувствителен к отклонению метрики в выборках от нормального распределения, чем критерий Барлетта.

In [72]:
stats.levene(data_clear['REVENUE'][data_clear['VARIANT_NAME'] == 'control'],
             data_clear['REVENUE'][data_clear['VARIANT_NAME'] == 'variant'])

LeveneResult(statistic=1.8703041860181113, pvalue=0.17149090193725022)

Для выручки по транзакциям: 

pvalue=0.17149090193725022 > 0.05 => результаты не статистически значимы, дисперсии в выборках однородны

In [73]:
stats.levene(data_clear[data_clear['VARIANT_NAME'] == 'control'].groupby('USER_ID').REVENUE.sum(),
             data_clear[data_clear['VARIANT_NAME'] == 'variant'].groupby('USER_ID').REVENUE.sum())

LeveneResult(statistic=1.8315496300786236, pvalue=0.17600861562570844)

Для **выручки** по пользователям: 

pvalue=0.176 > 0.05 => результаты не статистически значимы, дисперсии в выборках однородны

In [74]:
stats.levene(data_clear[data_clear['VARIANT_NAME'] == 'control'].groupby('USER_ID').action.sum(),
             data_clear[data_clear['VARIANT_NAME'] == 'variant'].groupby('USER_ID').action.sum())

LeveneResult(statistic=1.0574537484143178, pvalue=0.30384867663659315)

Для **кол-ву действий** по пользователям: 

pvalue=0.304 > 0.05 => результаты не статистически значимы, дисперсии в выборках однородны

**Проверку на нормальность** проводить избыточно, поскольку распределение заведомо биномиальное.
Просто для демонстрации методики приведу данный код:

In [169]:
def kstest_norm_test(df, colname = 'action', pvalue = 0.05):
    st = stats.kstest(df, 'norm')
    print(st)
    print(st[1], '\t', pvalue, st[1] < pvalue)
    if st[1] > pvalue:
        try:
            print('{} {} is NOT normal\n'.format('None' if df.name is None else df.name.upper(), colname.upper()))
        except Exception as e:
            print('NOT normal\n{}\n'.format(e))
    else:
        try:
            print('{} {} is normal\n'.format('None' if df.name is None else df.name.upper(), colname.upper()))
        except Exception as e:
            print('normal\n{}\n'.format(e))
            
            
def shapiro_norm_test(df, colname = 'action', pvalue = 0.05):
    st = stats.shapiro(df)
    print(st)
    print(st[1], '\t', pvalue, st[1] < pvalue)
    if st[1] > pvalue:
        try:
            print('{} {} is NOT normal\n'.format('None' if df.name is None else df.name.upper(), colname.upper()))
        except Exception as e:
            print('NOT normal\n{}\n'.format(e))
    else:
        try:
            print('{} {} is normal\n'.format('None' if df.name is None else df.name.upper(), colname.upper()))
        except Exception as e:
            print('normal\n{}\n'.format(e))

In [76]:
shapiro_norm_test(data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').action.sum(), 
                  colname = 'action')
shapiro_norm_test(data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').action.sum(), 
                  colname = 'action')


kstest_norm_test(data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').action.sum(), 
                  colname = 'action')
kstest_norm_test(data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').action.sum(), 
                  colname = 'action')

(0.12875598669052124, 0.0)
ACTION ACTION is NOT normal

(0.10889226198196411, 0.0)
ACTION ACTION is NOT normal

KstestResult(statistic=0.5, pvalue=0.0)
ACTION ACTION is NOT normal

KstestResult(statistic=0.5, pvalue=0.0)
ACTION ACTION is NOT normal



In [77]:
shapiro_norm_test(data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').REVENUE.sum(), 
                  colname = 'REVENUE')
shapiro_norm_test(data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').REVENUE.sum(), 
                  colname = 'REVENUE')


kstest_norm_test(data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').REVENUE.sum(), 
                  colname = 'REVENUE')
kstest_norm_test(data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').REVENUE.sum(), 
                  colname = 'REVENUE')

(0.07461178302764893, 0.0)
REVENUE REVENUE is NOT normal

(0.06145977973937988, 0.0)
REVENUE REVENUE is NOT normal

KstestResult(statistic=0.5, pvalue=0.0)
REVENUE REVENUE is NOT normal

KstestResult(statistic=0.5, pvalue=0.0)
REVENUE REVENUE is NOT normal



In [78]:
data_clear[(data_clear['VARIANT_NAME']=='control')].USER_ID.nunique()

2389

## Рассчёт стат. значимости эксперимента

И так, мы уже знаем, что экспериментальная группа явно проигрывает контрольной по выручке. 
Замечено и падение в экспериментальной группе платных транзакций, но не такое явное. 
Теперь необходимо определиться, является ли это падение стат.значимым (то есть какова вероятность, что мы не ошиблись и данные нашей выборки распространятся на всю генеральную совокупность).

Ниже я покажу несколько способов рассчёта стат.значимости для различных типов метрик, которые я использую на практике при рассчёте подобных эксперимента.

1) **Bootstrap** (или его упрощённый вариант - метод sub-bucket'ов, который менее устойчив к нормальному распределению);

2) **Метод линеаризации** (или разложение в ряд Тейлора в общем виде этой функции) для биномиального распределения и ratio-метрик, который также используется при рассчёте значимости изменения метрики длины сессии пользователя.

### T-Test:

In [209]:
def t_test(control, experiment):
    st, pval = stats.ttest_ind(control, experiment)
    print('ttest Statistic: %s\np-value: %s\nAvg: %s\nPercentiles: \n%s\n' % (st, pval, 
                                                                                pd.Series(control-experiment).mean(), 
                                                                                pd.Series(control-experiment).quantile(
                                                                                    [0., .01, .05, .1, .25, .5, .75, .9,
                                                                                    .95, .99, 1.0])))
    
    if ((pval>=0.05) and (st is not None)):
        print('ttest Same average')
    elif ((pval<0.05) and (st is not None)):
        print('ttest Different average')
    else:
        print('ttest is not applicapable')

### 1. Bootstrap

#### 1. На транзакцию

#### REVENUE

In [210]:
bs_data_a = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='control')].REVENUE.values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

bs_data_b = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='variant')].REVENUE.values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

In [211]:
pda = pd.DataFrame({'a': bs_data_a})
pdb = pd.DataFrame({'b': bs_data_b})

pda.name = 'Control'
pdb.name = 'Variant'

shapiro_norm_test(pda, 
                  colname = 'REVENUE')
shapiro_norm_test(pdb, 
                  colname = 'REVENUE')


kstest_norm_test(pda, 
                  colname = 'REVENUE')
kstest_norm_test(pdb, 
                  colname = 'REVENUE')

t_test(bs_data_a, bs_data_b)

(0.9941653609275818, 9.47542435639397e-20)
9.47542435639397e-20 	 0.05 True
CONTROL REVENUE is normal

(0.9921696782112122, 5.265910968525684e-23)
5.265910968525684e-23 	 0.05 True
VARIANT REVENUE is normal

KstestResult(statistic=0.56929847966608, pvalue=0.0)
0.0 	 0.05 True
CONTROL REVENUE is normal

KstestResult(statistic=0.5508771570735771, pvalue=0.0)
0.0 	 0.05 True
VARIANT REVENUE is normal

ttest Statistic: 138.09998225815013
p-value: 0.0
Avg: 0.032146342475103444
Percentiles: 
0.00   -0.055246
0.01   -0.021048
0.05   -0.005292
0.10    0.002859
0.25    0.016298
0.50    0.031789
0.75    0.047389
0.90    0.062468
0.95    0.070876
0.99    0.087698
1.00    0.131585
dtype: float64

ttest Different average


In [90]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=bs_data_a,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=bs_data_b,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='')

In [91]:
group_a = go.Box(x=bs_data_a, name = "Group A")
group_b = go.Box(x=bs_data_b, name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(bs_data_a), 
                       len(bs_data_b),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

Мы можем видеть, что после bootstrap'ирования распределения приняли ноормальный вид, а **выигрыш контрольной группы по выручке** стал ещё более очевиден.

#### По платным транзакциям

Здесь ннеобходимо использовать ratio-метрики.

In [98]:
bs_data_a = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='control')].action.values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

bs_data_b = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='variant')].action.values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

In [99]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=bs_data_a,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=bs_data_b,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='')

In [100]:
group_a = go.Box(x=bs_data_a, name = "Group A")
group_b = go.Box(x=bs_data_b, name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(bs_data_a), 
                       len(bs_data_b),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

Здесь мы наблюдаем метрику на транзакцию, ниже посмотрим метрики на пользователя:

#### 2. На пользователя

#### REVENUE

In [202]:
bs_data_a = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').REVENUE.sum().values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

bs_data_b = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').REVENUE.sum().values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)


In [208]:
pda = pd.DataFrame({'a': bs_data_a})
pdb = pd.DataFrame({'b': bs_data_b})

pda.name = 'Control'
pdb.name = 'Variant'

shapiro_norm_test(pda, 
                  colname = 'REVENUE')
shapiro_norm_test(pdb, 
                  colname = 'REVENUE')


kstest_norm_test(pda, 
                  colname = 'REVENUE')
kstest_norm_test(pdb, 
                  colname = 'REVENUE')

t_test(bs_data_a, bs_data_b)

(0.9975084066390991, 7.849268977844215e-12)
7.849268977844215e-12 	 0.05 True
CONTROL REVENUE is normal

(0.9964219331741333, 6.285504546856523e-15)
6.285504546856523e-15 	 0.05 True
VARIANT REVENUE is normal

KstestResult(statistic=0.514024381675617, pvalue=0.0)
0.0 	 0.05 True
CONTROL REVENUE is normal

KstestResult(statistic=0.5116681895242119, pvalue=0.0)
0.0 	 0.05 True
VARIANT REVENUE is normal

ttest Statistic: 102.98809183116798
p-value: 0.0
Avg: 0.004160055796197819
Percentiles: 
0.00   -0.011676
0.01   -0.004987
0.05   -0.002479
0.10   -0.001216
0.25    0.001292
0.50    0.004215
0.75    0.006730
0.90    0.009235
0.95    0.010906
0.99    0.013417
1.00    0.019699
dtype: float64

ttest Different average


Итак, у нас есть статистически значимая разница в выручке на пользователя. Контрольная группа выигрывает.

In [108]:
group_a = go.Box(x=bs_data_a, name = "Group A")
group_b = go.Box(x=bs_data_b, name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(bs_data_a), 
                       len(bs_data_b),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

In [204]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=bs_data_a,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=bs_data_b,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='')

#### По платным транзакциям на пользователя

In [205]:
bs_data_a = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').action.sum().values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

bs_data_b = bs.bootstrap(data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').action.sum().values, 
                                             stat_func=bs_stats.mean, 
             num_iterations=10000, iteration_batch_size=300, return_distribution=True)

In [206]:
pda = pd.DataFrame({'a': bs_data_a})
pdb = pd.DataFrame({'b': bs_data_b})

pda.name = 'Control'
pdb.name = 'Variant'

shapiro_norm_test(pda, 
                  colname = 'action')
shapiro_norm_test(pdb, 
                  colname = 'action')


kstest_norm_test(pda, 
                  colname = 'action')
kstest_norm_test(pdb, 
                  colname = 'action')

t_test(bs_data_a, bs_data_b)

(0.9975084066390991, 7.849268977844215e-12)
7.849268977844215e-12 	 0.05 True
CONTROL ACTION is normal

(0.9964219331741333, 6.285504546856523e-15)
6.285504546856523e-15 	 0.05 True
VARIANT ACTION is normal

KstestResult(statistic=0.514024381675617, pvalue=0.0)
0.0 	 0.05 True
CONTROL ACTION is normal

KstestResult(statistic=0.5116681895242119, pvalue=0.0)
0.0 	 0.05 True
VARIANT ACTION is normal

ttest Statistic: 102.98809183116798
p-value: 0.0
Avg: 0.004160055796197819
Percentiles: 0.00   -0.011676
0.01   -0.004987
0.05   -0.002479
0.10   -0.001216
0.25    0.001292
0.50    0.004215
0.75    0.006730
0.90    0.009235
0.95    0.010906
0.99    0.013417
1.00    0.019699
dtype: float64

ttest Different average


In [111]:
group_a = go.Box(x=bs_data_a, name = "Group A")
group_b = go.Box(x=bs_data_b, name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(bs_data_a), 
                       len(bs_data_b),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

In [112]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=bs_data_a,
    opacity=0.75, name = "Group A"
)
trace2 = go.Histogram(
    x=bs_data_b,
    opacity=0.35, name = "Group B"
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='')

#### Доля платных транзакций

In [115]:
ratio_control = data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').action.sum()/data_clear[(data_clear['VARIANT_NAME']=='control')].groupby('USER_ID').action.count()
ratio_variant = data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').action.sum()/data_clear[(data_clear['VARIANT_NAME']=='variant')].groupby('USER_ID').action.count()

In [153]:
ratio_control[(ratio_control > 0) & (ratio_control < 1)]

USER_ID
124     0.333333
169     0.500000
282     0.500000
487     0.500000
868     0.500000
1035    0.500000
1053    0.250000
1883    0.500000
1939    0.500000
2166    0.500000
2181    0.500000
3699    0.500000
4918    0.333333
5134    0.500000
5900    0.500000
6207    0.500000
6271    0.500000
6429    0.500000
7613    0.500000
7930    0.500000
8020    0.500000
8583    0.500000
8920    0.500000
9766    0.500000
Name: action, dtype: float64

In [154]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=ratio_control,
    opacity=0.75, name = "Group A", nbinsx = 100 
)
trace2 = go.Histogram(
    x=ratio_variant,
    opacity=0.35, name = "Group B", nbinsx = 100 
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='')

In [161]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Histogram(
    x=ratio_control[(ratio_control > 0)],
    opacity=0.75, name = "Group A", nbinsx = 10 
)
trace2 = go.Histogram(
    x=ratio_variant[(ratio_variant > 0)],
    opacity=0.35, name = "Group B", nbinsx = 10 
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='')

In [163]:
group_a = go.Box(x=ratio_control, name = "Group A")
group_b = go.Box(x=ratio_variant, name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(ratio_control), 
                       len(ratio_variant),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

In [162]:
group_a = go.Box(x=ratio_control[(ratio_control > 0)], name = "Group A")
group_b = go.Box(x=ratio_variant[(ratio_variant > 0)], name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(ratio_control[(ratio_control > 0)]), 
                       len(ratio_variant[(ratio_variant > 0)]),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

In [166]:
group_a = go.Box(x=ratio_control[(ratio_control > 0) & (ratio_control < 1)], name = "Group A")
group_b = go.Box(x=ratio_variant[(ratio_variant > 0) & (ratio_variant < 1)], name = "Group B")

layout = dict(
    title = "Bougth VAS",
    xaxis = dict(
        title='Days',
        range = [0,max(len(ratio_control[(ratio_control > 0) & (ratio_control < 1)]), 
                       len(ratio_variant[(ratio_variant > 0) & (ratio_variant < 1)]),)]
    ),
    
    yaxis=dict(
        title='Bougth VAS'
    ),
)

data = [group_a, group_b]
iplot(data)

In [141]:
print("**Control**: \nRatio Median = {rcm}, \nRatio Mean = {rca}, \n\n"
       "**Variant**: \nRatio Median = {rvm}, \nRatio Mean = {rva}, \n".format(rcm = ratio_control.median(), \
                                                                        rca = ratio_control.mean(), \
                                                                        rvm = ratio_variant.median(), \
                                                                        rva = ratio_variant.mean()))

**Control**: 
Ratio Median = 0.0, 
Ratio Mean = 0.016917817775917398, 

**Variant**: 
Ratio Median = 0.0, 
Ratio Mean = 0.014222036495333611, 



In [123]:
ratio_control.describe()

count    2389.000000
mean        0.016918
std         0.119061
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: action, dtype: float64

In [121]:
ratio_variant.describe()

count    2393.000000
mean        0.014222
std         0.113152
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: action, dtype: float64

#### Встроенный инструмент bootstrap + t_test:

In [212]:
bs_ab_estims = bs.bootstrap_ab(data_clear[(data_clear['VARIANT_NAME']=='control')].REVENUE.values, 
                               data_clear[(data_clear['VARIANT_NAME']=='variant')].REVENUE.values, 
                                   bs_stats.mean, 
                                   bs_compare.difference, num_iterations=100, alpha=0.05, 
                                   iteration_batch_size=100, scale_test_by=1, num_threads=4)

In [215]:
bs_ab_estims

0.03185100074934027    (-0.01855250838935281, 0.08631908034773735)

In [216]:
bs_ab_estims = bs.bootstrap_ab(data_clear[(data_clear['VARIANT_NAME']=='control')].action.values, 
                               data_clear[(data_clear['VARIANT_NAME']=='variant')].action.values, 
                                   bs_stats.mean, 
                                   bs_compare.difference, num_iterations=100, alpha=0.05, 
                                   iteration_batch_size=100, scale_test_by=1, num_threads=4)

In [217]:
bs_ab_estims

0.0033945113541338588    (-0.0030527823329459906, 0.010654364092483787)

Мы видим, что результат подтверждается - контрольная группа лидирует, p-value < 0.05 => данное лидерство статистически значимо.

## 2. Линеаризация для поюзерных метрик
подход используется для рассчёта стат.значимости при биномиальном распределении 
и при работе с данными о времени сессии (когда целесообразно преобразовывать метрики к т.н. ratio-метрикам).

In [218]:
users_revenue = {'control': list(), 'variant': list()}
users_actions = {'control': list(), 'variant': list()}

for uid in data_clear.USER_ID.tolist():
    # print("user_id = {user_id}, revenue = {revenue}".format(user_id = uid, revenue = '\t'.join([str(x) for x in data_clear[data_clear['USER_ID'] == uid].REVENUE.tolist()])))
    bucket = data_clear[data_clear['USER_ID'] == uid]['VARIANT_NAME'].unique()
    if len(bucket) > 1:
        break
    else:
        bucket = bucket[0]
    revenue = data_clear[data_clear['USER_ID'] == uid].REVENUE.tolist()
    action = data_clear[data_clear['USER_ID'] == uid].action.tolist()
    users_revenue[bucket].append(revenue)
    users_actions[bucket].append(action)

def linearization(control, experiment):
    numerator = 0
    denominator = 0
    for row in control:
        numerator += sum(row)
        denominator += len(row)

    control_mean = numerator / denominator
    new_control = [sum(row) - len(row) * control_mean for row in control]
    new_experiment = [sum(row) - len(row) * control_mean for row in experiment]
    return new_control, new_experiment

### Рассчёт тестов стат.значимости для линеаризованных метрик

In [238]:
def class_balancer(df1, df2, target = 'action', alpha_value=0.05):
    df1_size = len(df1)
    df2_size = len(df2)
    total_size = df1_size + df2_size
    chi2_stat, pvalue_stat, dof, expected = chi2_contingency([[df1_size, total_size], [df2_size, total_size]])
    
    df1_target, df2_target = None, None

    if pvalue_stat > alpha_value:
        print('Ok')
        sample_size = min([df1_size, df2_size])
        df1_rnd_indx = np.random.choice(df1_size, size=sample_size, replace=False)
        df2_rnd_indx = np.random.choice(df2_size, size=sample_size, replace=False)
        df1_target = (np.asarray(df1)[df1_rnd_indx]).tolist()
        df2_target = (np.asarray(df2)[df2_rnd_indx]).tolist()
        
    return df1_target, df2_target

experiment_new, control_new = class_balancer(users_revenue['control'], users_revenue['variant'], target = 'revenue')
t_test(np.array(control_new), np.array(experiment_new))

Ok


In [220]:
from scipy import stats
import numpy as np

# Рассчёт тестов стат.значимости:
# def t_test(control, experiment):
#     st, pval = stats.ttest_ind(control, experiment)
#     print('ttest Statistic: %s\\tp-val: %s' % (st, pval))
#     if ((pval>=0.05) and (st is not None)):
#         print('ttest Same average')
#     elif ((pval<0.05) and (st is not None)):
#         print('ttest Different average')
#     else:
#         print('ttest is not applicapable')

# def mannwhitneyu(control, experiment):
#     st,pval = stats.mannwhitneyu(control, experiment)
#     print('mann_whitney Statistic: %s\\tp-val: %s' % (st, pval))
#     if ((pval>=0.05) and (st is not None)):
#         print('mann_whitney Same average')
#     elif ((pval<0.05) and (st is not None)):
#         print('mann_whitney Different average')
#     else:
#         print('mann_whitney is not applicapable')


        
# Линеаризация
control, experiment = linearization(users_revenue['control'], users_revenue['variant'])
t_test(np.array(control), np.array(experiment))

control, experiment = linearization(users_actions['control'], users_actions['variant'])
t_test(control, experiment)

ValueError: operands could not be broadcast together with shapes (3025,) (3044,) 

## 3. Критерий хи-квадрат для биномиального распределения:

In [None]:
from scipy.stats import chi2_contingency

experiment = data_clear[(data_clear['VARIANT_NAME']=='variant') & (data_clear['action'] > 1)].action
control = data_clear[(data_clear['VARIANT_NAME']=='control') & (data_clear['action'] > 1)].action
total = data_clear[(data_clear['VARIANT_NAME']=='control')].action
chi2, p_value, dof, expected = chi2_contingency([[experiment], [control]])


**Вывод третий**: суммарный доход и количество действий в контрольной группе 

Поскольку в выборке присутствовало 39% пользователей, попавших в обе группы - и тестовую, и контрольную, - в идеале тест следовало бы провести повторно, устранив техническую ошибку (см. описание выше, в первом выводе).