In [50]:
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt


In [51]:
data = pd.read_csv('ab_data-40121-ef1895.csv')
data


Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0


In [52]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [53]:
# Пропусков нет


In [54]:
data.nunique()


user_id         290584
timestamp       294478
group                2
landing_page         2
converted            2
dtype: int64

In [55]:
# Не все user_id уникальны

In [56]:
data['user_id'].value_counts()


637561    2
821876    2
643869    2
938802    2
916765    2
         ..
710897    1
708848    1
665839    1
663790    1
630836    1
Name: user_id, Length: 290584, dtype: int64

In [57]:
# Есть user_id, которых по 2. Посмотрим на одного из них:

data[data['user_id'] == 637561]


Unnamed: 0,user_id,timestamp,group,landing_page,converted
85705,637561,2017-01-15 16:39:21.602921,treatment,new_page,0
276307,637561,2017-01-07 18:01:31.630446,control,new_page,0


In [58]:
data.groupby('group')['user_id'].nunique()


group
control      146195
treatment    146284
Name: user_id, dtype: int64

In [59]:
data.groupby('landing_page')['user_id'].nunique()


landing_page
new_page    146317
old_page    146265
Name: user_id, dtype: int64

In [60]:
data[data['group'] == 'control']['landing_page'].value_counts()


old_page    145274
new_page      1928
Name: landing_page, dtype: int64

In [61]:
data[data['group'] == 'treatment']['landing_page'].value_counts()


new_page    145311
old_page      1965
Name: landing_page, dtype: int64

### Так как мы работаем с распределением Бернулли, будем использовать Z-критерий

# Доверительный интервал

In [62]:
# Treatment-group rows that were actually served the new page.
is_treatment = data['group'] == 'treatment'
saw_new_page = data['landing_page'] == 'new_page'
data_new = data[is_treatment & saw_new_page]
data_new


Unnamed: 0,user_id,timestamp,group,landing_page,converted
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1
...,...,...,...,...,...
294462,677163,2017-01-03 19:41:51.902148,treatment,new_page,0
294465,925675,2017-01-07 20:38:26.346410,treatment,new_page,0
294468,643562,2017-01-02 19:20:05.460595,treatment,new_page,0
294472,822004,2017-01-04 03:36:46.071379,treatment,new_page,0


In [63]:
# Control-group rows that were actually served the old page.
is_control = data['group'] == 'control'
saw_old_page = data['landing_page'] == 'old_page'
data_old = data[is_control & saw_old_page]
data_old


Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
7,719014,2017-01-17 01:48:29.539573,control,old_page,0
...,...,...,...,...,...
294471,718310,2017-01-21 22:44:20.378320,control,old_page,0
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0


In [64]:
# Проверим: нет ли в них одинаковых user-id
data_old['user_id'].value_counts()


790526    1
814759    1
849588    1
837298    1
798383    1
         ..
740412    1
828481    1
834626    1
822340    1
794628    1
Name: user_id, Length: 145274, dtype: int64

In [65]:
# Проверим: нет ли в них одинаковых user-id
data_new['user_id'].value_counts()


773192    2
788479    1
699391    1
713718    1
719861    1
         ..
678430    1
913102    1
762401    1
768546    1
790530    1
Name: user_id, Length: 145310, dtype: int64

In [66]:
data[data['user_id'] == 773192]


Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [67]:
# Drop every user that appears more than once in the treatment/new_page sample
# (in this dataset that is only user_id 773192, inspected above), so that each
# remaining user contributes exactly one observation. Filtering on duplicated
# ids instead of a hard-coded id keeps the cell correct if the input data changes.
data_new = data_new[~data_new['user_id'].duplicated(keep=False)]
data_new.shape


(145309, 5)

In [68]:
# Check whether any user_id appears in both samples.
old_ids = set(data_old['user_id'].tolist())
new_ids = set(data_new['user_id'].tolist())
result = list(old_ids.intersection(new_ids))
result


[]

In [69]:
# Пересечений нет, можем их использовать как независимые выборки


In [70]:
# Plain Python lists of the `converted` flags for each sample; the printed
# lengths are the sample sizes used by the tests below.
clicks_old_page = data_old['converted'].tolist()
clicks_new_page = data_new['converted'].tolist()
print(len(clicks_old_page), len(clicks_new_page))


145274 145309


In [71]:
from statsmodels.stats.proportion import proportion_confint

# Wilson score intervals for the conversion rate on each page.
old_successes, old_trials = sum(clicks_old_page), len(clicks_old_page)
new_successes, new_trials = sum(clicks_new_page), len(clicks_new_page)

interval_old_page = proportion_confint(old_successes, old_trials, method = 'wilson')
interval_new_page = proportion_confint(new_successes, new_trials, method = 'wilson')

print('95%% confidence interval for a click probability, banner a: [%f, %f]' % interval_old_page)
print('95%% confidence interval for a click probability, banner b [%f, %f]' % interval_new_page)


95% confidence interval for a click probability, banner a: [0.118723, 0.122070]
95% confidence interval for a click probability, banner b [0.117155, 0.120483]


Рассчитанные интервалы существенно перекрываются (их пересечение — примерно [0.1187, 0.1205]), поэтому мы не можем сделать вывод, что новый баннер лучше.


# Z-критерий

#### Наши гипотезы: 

$H_0\colon$ Наши баннеры по результату равны

$H_1\colon$ Результаты баннеров не равны


## Z-критерий для разности долей (независимые выборки)


In [72]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, and the functions below rely on `scipy.stats.norm`.
import scipy.stats


In [73]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 outcomes (must support ``sum`` and ``len``).
    alpha : significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.
    """
    size1, size2 = len(sample1), len(sample2)
    rate1 = float(sum(sample1)) / size1
    rate2 = float(sum(sample2)) / size2

    # Two-sided critical value of the standard normal distribution.
    z_crit = scipy.stats.norm.ppf(1 - alpha / 2.)

    diff = rate1 - rate2
    # Unpooled standard error of the difference, scaled by the critical value.
    half_width = z_crit * np.sqrt(rate1 * (1 - rate1)/ size1 + rate2 * (1 - rate2)/ size2)

    return (diff - half_width, diff + half_width)


In [74]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of two proportions (independent samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 outcomes (must support ``sum`` and ``len``).

    Returns
    -------
    The difference ``p1 - p2`` divided by its standard error computed from the
    pooled proportion of both samples.
    """
    size1, size2 = len(sample1), len(sample2)
    rate1 = float(sum(sample1)) / size1
    rate2 = float(sum(sample2)) / size2

    # Pooled success rate across both samples.
    pooled = float(rate1*size1 + rate2*size2) / (size1 + size2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (rate1 - rate2) / std_err


In [75]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z-statistic into a p-value under the standard normal distribution.

    Parameters
    ----------
    z_stat : value of the z-statistic.
    alternative : one of ``'two-sided'``, ``'less'`` or ``'greater'``.

    Returns
    -------
    The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper-tail probabilities use the survival function instead of `1 - cdf`:
    # for large |z| the cdf rounds to exactly 1.0 and the subtraction would
    # collapse the p-value to 0.0.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # alternative == 'greater'
    return scipy.stats.norm.sf(z_stat)
    

In [76]:
print("95%% confidence interval for a difference between proportions: [%f, %f]" %\
      proportions_diff_confint_ind(clicks_old_page, clicks_new_page))


95% confidence interval for a difference between proportions: [-0.000782, 0.003937]


In [77]:
# Интервал узкий и содержит ноль, то есть разность долей статистически неотличима от нуля.
# Соответственно, сделать вывод, что новый баннер приносит больше кликов, не можем.


In [78]:
print("p-value: %f" % 
      proportions_diff_z_test(proportions_diff_z_stat_ind(clicks_old_page, clicks_new_page)))


p-value: 0.190114


In [79]:
print("p-value: %f" % 
      proportions_diff_z_test(proportions_diff_z_stat_ind(clicks_old_page, clicks_new_page), 'less'))


p-value: 0.904943


На основе полученных p-value мы не можем отвергнуть нашу нулевую гипотезу о том, что результаты старого и нового баннеров одинаковы

## Z-критерий для разности долей (связанные выборки)


In [80]:
# Count rows per user once; taking ids and counts from the same Series
# avoids a second pass over the data and keeps the two lists aligned.
rows_per_user = data['user_id'].value_counts()
user_id_l = rows_per_user.index.tolist()
un_value = rows_per_user.tolist()
print(len(user_id_l), len(un_value))


290584 290584


In [81]:
# Users with exactly two rows go to the "related" list; all others to "unrelated".
related_samples = [uid for uid, n_rows in zip(user_id_l, un_value) if n_rows == 2]
unrelated_samples = [uid for uid, n_rows in zip(user_id_l, un_value) if n_rows != 2]

print(len(related_samples), len(unrelated_samples))


3894 286690


In [82]:
data_related = data.loc[data['user_id'].isin(related_samples)]
data_related


Unnamed: 0,user_id,timestamp,group,landing_page,converted
22,767017,2017-01-12 22:58:14.991443,control,new_page,0
192,656468,2017-01-18 07:13:29.805052,treatment,new_page,1
226,773693,2017-01-23 18:05:45.167335,control,old_page,1
240,733976,2017-01-11 15:11:16.407599,control,new_page,0
246,704650,2017-01-04 19:10:52.655062,treatment,new_page,0
...,...,...,...,...,...
294308,905197,2017-01-03 06:56:47.488231,treatment,new_page,0
294309,787083,2017-01-17 00:15:20.950723,control,old_page,0
294328,641570,2017-01-09 21:59:27.695711,control,old_page,0
294331,689637,2017-01-13 11:34:28.339532,control,new_page,0


In [83]:
data_related[data_related['group'] == 'control']['landing_page'].value_counts()


old_page    1981
new_page    1928
Name: landing_page, dtype: int64

In [84]:
data_related[data_related['group'] == 'treatment']['landing_page'].value_counts()


old_page    1965
new_page    1914
Name: landing_page, dtype: int64

In [85]:
# четкого разделения нет, поэтому разделим данные по страницам: по новой и по старой:
data_related_old = data_related[data_related['landing_page'] == 'old_page']
data_related_new = data_related[data_related['landing_page'] == 'new_page']

print(len(data_related_old), len(data_related_new))


3946 3842


In [86]:
# Выделим лишь те user-id, у которых есть данные по кликам на новый и старый баннер


In [87]:
list_old_id = data_related_old['user_id'].tolist()
list_new_id = data_related_new['user_id'].tolist()

print(len(list_old_id), len(list_new_id))


3946 3842


In [88]:
result_intersection = list(set(list_old_id) & set(list_new_id))
len(result_intersection)


1998

In [89]:
data_old_in = data_related_old.loc[data_related_old['user_id'].isin(result_intersection)]
data_new_in = data_related_new.loc[data_related_new['user_id'].isin(result_intersection)]


In [90]:
# Order both frames by user_id so that position i in each list refers to the same user.
old_by_user = data_old_in.sort_values(by='user_id')
new_by_user = data_new_in.sort_values(by='user_id')

# The paired test compares the two lists element-wise. A duplicated or missing
# user_id would silently pair different users, so fail loudly instead.
assert old_by_user['user_id'].is_unique, "a user has more than one old_page row"
assert old_by_user['user_id'].tolist() == new_by_user['user_id'].tolist(), \
    "old_page and new_page rows are not aligned one-to-one by user_id"

l_old = old_by_user['converted'].tolist()
l_new = new_by_user['converted'].tolist()


In [91]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (paired samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 outcomes, paired position by position
        (pairs are formed with ``zip``, so extra items of the longer one are ignored).
    alpha : significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # Discordant pairs: f counts (1, 0), g counts (0, 1).
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    z_crit = scipy.stats.norm.ppf(1 - alpha / 2.)
    diff = float(f - g) / n
    half_width = z_crit * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)

    return (diff - half_width, diff + half_width)


In [92]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of two proportions (paired samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 outcomes, paired position by position
        (pairs are formed with ``zip``, so extra items of the longer one are ignored).

    Returns
    -------
    ``(f - g) / sqrt(f + g - (f - g)**2 / n)`` where ``f`` and ``g`` are the
    numbers of (1, 0) and (0, 1) pairs and ``n`` is the number of pairs.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # Only discordant pairs carry information about the difference.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    denominator = np.sqrt(f + g - float((f - g)**2) / n )
    return float(f - g) / denominator


In [93]:
print("95%% confidence interval for a difference between proportions: [%f, %f]" \
      % proportions_diff_confint_rel(l_old, l_new))


95% confidence interval for a difference between proportions: [-0.014802, 0.026814]


In [94]:
# Т.к. ноль входит в наш интервал, то мы не можем сказать о различии результатов кликов для наших баннеров


In [95]:
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_rel(l_old, l_new)))


p-value: 0.571577


In [96]:
# p-value слишком большое, соответственно, мы не можем отвергнуть нашу нулевую гипотезу


In [97]:
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_rel(l_old, l_new), 'less'))


p-value: 0.714212


In [98]:
# Для односторонней альтернативы мы тоже не можем отвергнуть нашу нулевую гипотезу:
# p-value здесь еще больше, чем для двусторонней альтернативы.


# Вывод

Статистически значимых различий между баннерами не обнаружено: у нас нет оснований утверждать, что новый баннер принес увеличение кликов.
