In [2]:
from typing import Union
from tqdm import tqdm

import pandas as pd
import numpy as np
import plotly.express as px

from scipy import stats
from statsmodels.stats.meta_analysis import effectsize_smd
from statsmodels.stats import proportion
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats.power import zt_ind_solve_power

In [3]:
data = pd.read_csv('gb_sem_9_hw.csv')

In [4]:
data

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,False,False
1,337,gate_30,38,True,False
2,377,gate_40,165,True,False
3,483,gate_40,1,False,False
4,488,gate_40,179,True,True
...,...,...,...,...,...
90184,9999441,gate_40,97,True,False
90185,9999479,gate_40,30,False,False
90186,9999710,gate_30,28,True,False
90187,9999768,gate_40,51,True,False


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  bool  
 4   retention_7     90189 non-null  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 2.2+ MB


In [6]:
# from operator import mul

# def convert_time(my_time: str):
#     factors = (1, 1/60)
#     time = sum(map(mul, map(float, my_time.split(':')), factors))
#     return round(time, 2)

# data.time = data.time.apply(convert_time)
data.version.replace({'gate_30': int(0), 'gate_40': int(1)}, inplace=True)
# data.retention_1.replace({'False': int(0), 'True': int(1)}, inplace=False)
# data.retention_7.replace({'False': int(0), 'True': int(1)}, inplace=False)

In [7]:
data

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,0,3,False,False
1,337,0,38,True,False
2,377,1,165,True,False
3,483,1,1,False,False
4,488,1,179,True,True
...,...,...,...,...,...
90184,9999441,1,97,True,False
90185,9999479,1,30,False,False
90186,9999710,0,28,True,False
90187,9999768,1,51,True,False


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   userid          90189 non-null  int64
 1   version         90189 non-null  int64
 2   sum_gamerounds  90189 non-null  int64
 3   retention_1     90189 non-null  bool 
 4   retention_7     90189 non-null  bool 
dtypes: bool(2), int64(3)
memory usage: 2.2 MB


In [9]:
data.describe()

Unnamed: 0,userid,version,sum_gamerounds
count,90189.0,90189.0,90189.0
mean,4998412.0,0.504374,51.872457
std,2883286.0,0.499984,195.050858
min,116.0,0.0,0.0
25%,2512230.0,0.0,5.0
50%,4995815.0,1.0,16.0
75%,7496452.0,1.0,51.0
max,9999861.0,1.0,49854.0


In [10]:
data.version.equals(data.sum_gamerounds)

False

In [11]:
data.version.compare(data.sum_gamerounds).index

Int64Index([    0,     1,     2,     4,     5,     7,     8,     9,    10,
               12,
            ...
            90179, 90180, 90181, 90182, 90183, 90184, 90185, 90186, 90187,
            90188],
           dtype='int64', length=85463)

In [12]:
data.iloc[data.version.compare(data.sum_gamerounds).index, :]

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,0,3,False,False
1,337,0,38,True,False
2,377,1,165,True,False
4,488,1,179,True,True
5,540,1,187,True,True
...,...,...,...,...,...
90184,9999441,1,97,True,False
90185,9999479,1,30,False,False
90186,9999710,0,28,True,False
90187,9999768,1,51,True,False


In [13]:
# data_2 = data.drop(data.version.compare(data.sum_gamerounds).index).copy(deep=True)

In [14]:
# data_2.version.equals(data_2.sum_gamerounds)

In [15]:
# data_2.userid.value_counts()

In [16]:
# data_2 = data_2.loc[data_2.userid.isin(data_2.userid.value_counts()[data_2.userid.value_counts() == 1].index.values), :]

In [17]:
# data_2

In [18]:
def continious_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000) -> pd.DataFrame:
    # Статистика по выборкам
    size = control.loc[:, column].shape[0]
    
    control_mean = control.loc[:, column].mean()
    treatment_mean = treatment.loc[:, column].mean()
    
    control_std = control.loc[:, column].std(ddof=1)
    treatment_std = treatment.loc[:, column].std(ddof=1)
    
    # Бутсрап
    booted_diff = []
    for _ in tqdm(range(n_iters)):
        control_sample = control.loc[:, column].sample(n=size, replace=True).values
        treatment_sample = treatment.loc[:, column].sample(n=size, replace=True).values
        booted_diff.append(np.mean(control_sample - treatment_sample))
    
    # Считаем статистику после бустрапа
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    p_value_ci = 2 * (1 - stats.norm.cdf(np.abs(md_ci / std_ci)))
    
    # Считаем мощность эксперимента
    effect_size, _ = effectsize_smd(mean1=treatment_mean, sd1=treatment_std, nobs1=size,
                                    mean2=control_mean, sd2=control_std, nobs2=size)
    power = tt_ind_solve_power(effect_size=effect_size,
                               nobs1=size,
                               alpha=.05,
                               power=None,
                               ratio=1)
    # Формируем отчёт 
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci, 
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column]) 
    return result

In [19]:
def proportion_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000) -> pd.DataFrame:
    # Вероятность событий
    size = control.loc[:, column].shape[0]
    prop_control = control.loc[:, column].sum() / size
    prop_treatment = treatment.loc[:, column].sum() / size
    
    # Бутсрап
    booted_diff = []
    for _ in tqdm(range(n_iters)):
        control_sample = stats.bernoulli.rvs(p=prop_control, size=size)
        treatment_sample = stats.bernoulli.rvs(p=prop_treatment, size=size)
        booted_diff.append(np.mean(control_sample - treatment_sample))
    
    # Считаем статистику после бустрапа
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    p_value_ci = 2 * (1 - stats.norm.cdf(np.abs(md_ci / std_ci)))
    
    # Считаем мощность эксперимента
    effect_size = proportion.proportion_effectsize(prop_control, prop_treatment)
    
    power = zt_ind_solve_power(effect_size=effect_size,
                               nobs1=size,
                               alpha=.05,
                               power=None,
                               ratio=1)
    # Формируем отчёт 
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci, 
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column]) 
    return result

In [20]:
control = data[data.version == 0].copy(deep=True)
treatment = data[data.version == 1].copy(deep=True)

In [21]:
### Testing timespent
control.shape, treatment.shape

((44700, 5), (45489, 5))

In [None]:
fig = px.histogram(data,
                   x='sum_gamerounds',
                   color = 'version',
                   title='avg_site_visits_distribution',
                   marginal = 'box',
                   nbins = 100,
                   barmode='overlay')

fig.show()

In [23]:
continious_result(control, treatment, 'sum_gamerounds')

100%|██████████| 10000/10000 [00:35<00:00, 285.56it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
sum_gamerounds,-0.005915,0.395978,0.856725,"[-1.009, 4.084]",1.127746


In [24]:
### Bucket

In [25]:
for _ in range(100, 1001): 
    if data.shape[0] % _ == 0:
        print(_)

911


In [26]:
n_buckets = 911
data = (data
 .sample(n=data.shape[0], replace=False)
 .reset_index(drop=True)
 .assign(bucket=list(range(n_buckets)) * int(data.shape[0] / n_buckets)))

In [27]:
data.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7,bucket
0,3668924,1,10,False,False,0
1,7731155,0,5,True,False,1
2,717162,0,185,True,True,2
3,8783797,0,6,False,False,3
4,8795481,0,137,False,True,4


In [28]:
bucketed_data = data.groupby(['version', 'bucket'])['sum_gamerounds'].agg(mu=np.mean, std=np.std).reset_index()
bucketed_data

Unnamed: 0,version,bucket,mu,std
0,0,0,43.843137,62.293940
1,0,1,51.560000,89.974636
2,0,2,52.961538,102.307871
3,0,3,67.142857,133.184142
4,0,4,25.488372,37.167813
...,...,...,...,...
1817,1,906,29.134615,52.471105
1818,1,907,51.020833,100.836183
1819,1,908,48.666667,98.243056
1820,1,909,74.755556,84.198941


In [29]:
# Сравним исходное выборочное среднее и среднее бакетных средних 
round(np.mean(data["sum_gamerounds"]), 5), round(np.mean(bucketed_data["mu"]), 5)

(51.87246, 51.95674)

In [30]:
round(np.std(data["sum_gamerounds"]), 5), round(np.mean(bucketed_data["std"]), 5)

(195.04978, 97.1147)

In [31]:
control_bucket = bucketed_data[bucketed_data.version == 0]
treatment_bucket = bucketed_data[bucketed_data.version == 1]
continious_result(control_bucket, treatment_bucket, 'mu')

100%|██████████| 10000/10000 [00:03<00:00, 2866.01it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
mu,-0.041501,0.371089,0.856526,"[-1.0, 4.687]",1.321708


In [None]:
### Testing converted

In [None]:
fig = px.histogram(data, x="converted",
                   color='sum_gamerounds', barmode='group',
                   height=400)
fig.show()

In [32]:
proportion_result(control, treatment, 'retention_1')

100%|██████████| 10000/10000 [00:32<00:00, 311.93it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
retention_1,-0.003823,0.563692,0.911819,"[-0.008, 0.005]",-0.001927


In [33]:
proportion_result(control, treatment, 'retention_7')

100%|██████████| 10000/10000 [00:25<00:00, 392.37it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
retention_7,0.012776,0.057205,0.519844,"[-0.0, 0.01]",0.004979
