In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from statsmodels.stats.meta_analysis import effectsize_smd
from statsmodels.stats import proportion
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats.power import zt_ind_solve_power
import plotly.express as px

In [2]:
df = pd.pandas.read_csv('data/gb_sem_9_hw.csv')
df


Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,False,False
1,337,gate_30,38,True,False
2,377,gate_40,165,True,False
3,483,gate_40,1,False,False
4,488,gate_40,179,True,True
...,...,...,...,...,...
90184,9999441,gate_40,97,True,False
90185,9999479,gate_40,30,False,False
90186,9999710,gate_30,28,True,False
90187,9999768,gate_40,51,True,False


In [3]:
df.version.replace({'gate_30': 0, 'gate_40': 1}, inplace=True)
df

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,0,3,False,False
1,337,0,38,True,False
2,377,1,165,True,False
3,483,1,1,False,False
4,488,1,179,True,True
...,...,...,...,...,...
90184,9999441,1,97,True,False
90185,9999479,1,30,False,False
90186,9999710,0,28,True,False
90187,9999768,1,51,True,False


In [4]:
def continious_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000) -> pd.DataFrame:
    # Статистика по выборкам
    size = control.loc[:, column].shape[0]
    
    control_mean = control.loc[:, column].mean()
    treatment_mean = treatment.loc[:, column].mean()
    
    control_std = control.loc[:, column].std(ddof=1)
    treatment_std = treatment.loc[:, column].std(ddof=1)
    
    # Бутсрап
    booted_diff = []
    for _ in tqdm(range(n_iters)):
        control_sample = control.loc[:, column].sample(n=size, replace=True).values
        treatment_sample = treatment.loc[:, column].sample(n=size, replace=True).values
        booted_diff.append(np.mean(control_sample - treatment_sample))
    
    # Считаем статистику после бустрапа
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    p_value_ci = 2 * (1 - stats.norm.cdf(np.abs(md_ci / std_ci)))
    
    # Считаем мощность эксперимента
    effect_size, _ = effectsize_smd(mean1=treatment_mean, sd1=treatment_std, nobs1=size,
                                    mean2=control_mean, sd2=control_std, nobs2=size)
    power = tt_ind_solve_power(effect_size=effect_size,
                               nobs1=size,
                               alpha=.05,
                               power=None,
                               ratio=1)
    # Формируем отчёт 
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci, 
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column]) 
    return result

In [5]:
def proportion_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000) -> pd.DataFrame:
    # Вероятность событий
    size = control.loc[:, column].shape[0]
    prop_control = control.loc[:, column].sum() / size
    prop_treatment = treatment.loc[:, column].sum() / size
    
    # Бутсрап
    booted_diff = []
    for _ in tqdm(range(n_iters)):
        control_sample = stats.bernoulli.rvs(p=prop_control, size=size)
        treatment_sample = stats.bernoulli.rvs(p=prop_treatment, size=size)
        booted_diff.append(np.mean(control_sample - treatment_sample))
    
    # Считаем статистику после бустрапа
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    p_value_ci = 2 * (1 - stats.norm.cdf(np.abs(md_ci / std_ci)))
    
    # Считаем мощность эксперимента
    effect_size = proportion.proportion_effectsize(prop_control, prop_treatment)
    
    power = zt_ind_solve_power(effect_size=effect_size,
                               nobs1=size,
                               alpha=.05,
                               power=None,
                               ratio=1)
    # Формируем отчёт 
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci, 
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column]) 
    return result

In [6]:
gate_30 = df[df.version == 0].copy(deep=True)
gate_40 = df[df.version == 1].copy(deep=True)

In [7]:
gate_30.shape, gate_40.shape

((44700, 5), (45489, 5))

In [8]:
fig = px.histogram(df,
                   x='sum_gamerounds',
                   color = 'sum_gamerounds',
                   title='sumgamerounds',
                   marginal = 'box',
                   nbins = 100,
                   barmode='overlay')

fig.show()

In [9]:
continious_result(gate_30, gate_40, 'sum_gamerounds')
 

100%|██████████| 10000/10000 [00:43<00:00, 231.53it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
sum_gamerounds,-0.005915,0.368739,0.856725,"[-0.962, 4.124]",1.183167


In [10]:
proportion_result(gate_30, gate_40, 'retention_1')

100%|██████████| 10000/10000 [00:32<00:00, 303.03it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
retention_1,-0.003823,0.5726,0.911819,"[-0.009, 0.005]",-0.001884


In [11]:
proportion_result(gate_30, gate_40, 'retention_7')

100%|██████████| 10000/10000 [00:29<00:00, 341.54it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
retention_7,0.012776,0.059362,0.519844,"[-0.0, 0.01]",0.004938


**Вывод:**Во всех тестах проведеных выше нет статистически значимых различий так как alpha и beta во всех случаях больше номинальных значений и во всех случаях доверительный интервал проходит через 0.

In [12]:
for _ in range(100, 1001): 
    if df.shape[0] % _ == 0:
        print(_)

911


In [13]:
n_buckets = 911
data_2 = (df
 .sample(n=df.shape[0], replace=False)
 .reset_index(drop=True)
 .assign(bucket=list(range(n_buckets)) * int(df.shape[0] / n_buckets)))
data_2

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7,bucket
0,729468,1,5,False,False,0
1,2513183,1,11,True,True,1
2,9633115,0,37,True,False,2
3,475398,1,85,True,False,3
4,5333184,1,206,True,True,4
...,...,...,...,...,...,...
90184,2456019,1,116,True,True,906
90185,9247982,1,13,True,False,907
90186,2890348,0,34,False,False,908
90187,3081472,0,21,False,True,909


In [14]:
df.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,0,3,False,False
1,337,0,38,True,False
2,377,1,165,True,False
3,483,1,1,False,False
4,488,1,179,True,True


In [15]:
data_bucket = data_2.groupby(['version','bucket'])['sum_gamerounds'].agg(mu=np.mean, std=np.std).reset_index()
data_bucket.head()

Unnamed: 0,version,bucket,mu,std
0,0,0,28.116279,35.446948
1,0,1,30.851064,39.212896
2,0,2,46.93617,89.359899
3,0,3,45.479167,120.480773
4,0,4,51.870968,78.884617


In [16]:
round(np.mean(data_2["sum_gamerounds"]), 5), round(np.mean(data_bucket["mu"]), 5)

(51.87246, 51.92053)

In [17]:
round(np.std(data_2["sum_gamerounds"]), 5), round(np.mean(data_bucket["std"]), 5)

(195.04978, 97.16434)

In [18]:
control_bucket = data_bucket[data_bucket.version == 0]
treatment_bucket = data_bucket[data_bucket.version == 1]
continious_result(control_bucket, treatment_bucket, 'mu')

100%|██████████| 10000/10000 [00:04<00:00, 2160.41it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
mu,-0.03778,0.415132,0.872928,"[-1.078, 3.976]",1.076396


**Вывод:**При проведение теста с применением бакетирования выяснилось что разнизницы в значениях alpha, beta, CI почти нет, а время затрачениое на тест уменьшилось в 8 раз. Атак же при использование пропорций метод бакетирования не используется.