In [9]:
import sys
import numpy as np
import pandas as pd
import scipy as sc
from scipy.stats import mannwhitneyu, zscore, ttest_ind, ttest_ind_from_stats, \
                        f_oneway, wilcoxon, kruskal, friedmanchisquare
from statsmodels.stats import weightstats as stests
import matplotlib.pyplot as plt

sys.path.append("../")

from src.ab.helpers import min_sample_size_avg
from src.ab.data import BernoulliDataGenerator
from bayesian_testing.experiments import BinaryDataTest, DeltaLognormalDataTest

%matplotlib inline

# Experiment-design constants. None of them is referenced in the cells visible
# in this part of the notebook, so the meanings below are assumptions.
# presumably the minimum detectable effect — TODO confirm (absolute vs. relative)
MDE = 0.02
# presumably the type II error rate (power = 1 - beta) — TODO confirm
beta = 0.2
# presumably the significance level (type I error rate) — TODO confirm
alpha = 0.07

In [5]:
class LogNormalGenerator():
    """Draws log-normally distributed samples multiplied by a constant scale.

    Parameters
    ----------
    mean : float
        Mean of the underlying normal distribution (forwarded as ``mean``).
    std : float
        Standard deviation of the underlying normal distribution
        (forwarded as ``sigma``).
    scale : float, optional
        Multiplier applied to every draw. Defaults to 1000, the value that
        used to be hard-coded in ``get_sample``.
    seed : int or None, optional
        When given, the instance draws from its own
        ``np.random.RandomState(seed)`` so its output is reproducible and
        independent of the global RNG state. When ``None`` (default), draws
        come from NumPy's global RNG, exactly as before.
    """
    def __init__(self,
                 mean,
                 std,
                 scale=1000,
                 seed=None):
        self.mean = mean
        self.std = std
        self.scale = scale
        # Both the `np.random` module and a RandomState expose `.lognormal`
        # with the same signature, so either can serve as the draw source.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def get_sample(self,
                   size):
        """Return ``size`` log-normal draws, each multiplied by ``self.scale``.

        ``size`` is forwarded unchanged to NumPy's ``lognormal``.
        """
        return self._rng.lognormal(mean=self.mean,
                                   sigma=self.std,
                                   size=size) * self.scale

In [70]:
# Parameters of the underlying normal distribution and the per-group sample size.
mu = 0.6
sd = 0.6
SIZE = 3000

# Both variants share the same `mean`; variant B gets a 5% larger sigma.
generator_a = LogNormalGenerator(mean=mu, std=sd)
generator_b = LogNormalGenerator(mean=mu, std=sd*1.05)

# Two groups configured with the same rate (0.2 each).
# presumably `ctrs` are per-group Bernoulli success probabilities — TODO confirm
ctr_gen = BernoulliDataGenerator(n_groups=2, ctrs=[0.2, 0.2])

In [71]:
# Seed NumPy's global RNG so that re-running this cell reproduces the same
# synthetic data set.
# NOTE(review): this only affects code that draws from the global NumPy RNG
# (LogNormalGenerator does by default); confirm BernoulliDataGenerator does too.
SEED = 42
np.random.seed(SEED)

# Draw revenue for A first, then B (same order as before), then the visit flags.
revenue_a = generator_a.get_sample(size=SIZE)
revenue_b = generator_b.get_sample(size=SIZE)
retro_data_ctr = ctr_gen.get_sample(update_size=len(revenue_a))

# One frame per variant: revenue, visit flag and the group label.
df_control = pd.DataFrame({'RTO': revenue_a,
                           'IS_VISIT': retro_data_ctr[0]})
df_experiment = pd.DataFrame({'RTO': revenue_b,
                              'IS_VISIT': retro_data_ctr[1]})
df_control['TARGET_GROUP'] = 'A'
df_experiment['TARGET_GROUP'] = 'B'

# ignore_index=True gives the combined frame a unique 0..2*SIZE-1 index; without
# it both halves keep labels 0..SIZE-1 and every label appears twice.
df = pd.concat([df_control, df_experiment], ignore_index=True)

# Per-group summary of the revenue column (this is the cell's displayed output).
df.groupby('TARGET_GROUP')['RTO'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
TARGET_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,3000.0,2211.23237,1470.728728,214.66238,1229.930364,1843.946044,2777.419598,15821.63745
B,3000.0,2192.081391,1445.510325,258.926386,1187.826706,1825.230724,2777.120167,11105.976532


In [72]:
# Per-group summary statistics of the IS_VISIT column.
(
    df
    .groupby('TARGET_GROUP')['IS_VISIT']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
TARGET_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,3000.0,0.203,0.4023,0.0,0.0,0.0,0.0,1.0
B,3000.0,0.189,0.391574,0.0,0.0,0.0,0.0,1.0


### Standard A/B test (frequentist t-test)

In [73]:
# Revenue per variant as plain NumPy arrays; `a` and `b` are reused by the
# Bayesian cell further down.
a = df.loc[df['TARGET_GROUP'] == 'A', 'RTO'].values
b = df.loc[df['TARGET_GROUP'] == 'B', 'RTO'].values

# Two-sided Student's t-test with pooled variance; the cell displays the p-value.
# NOTE(review): variant B is generated with a 5% larger sigma, so Welch's test
# (equal_var=False) may be the better fit — confirm intent.
ttest_ind(a, b, equal_var=True, alternative='two-sided').pvalue

0.6110095583422286

### Bayesian A/B test

In [74]:
# Register both revenue arrays under their variant names, then display the
# result of probabs_of_being_best() for the two variants.
rev_test = DeltaLognormalDataTest()
for variant_label, variant_values in (("A", a), ("B", b)):
    rev_test.add_variant_data(variant_label, variant_values)
rev_test.probabs_of_being_best()

{'A': 0.6593, 'B': 0.3407}