In [1]:
import pandas as pd 
import numpy as np
from scipy.stats import norm, ttest_ind
from datetime import datetime

In [2]:
def get_sample_size_abs(epsilon, std, alpha=0.05, beta=0.2):
    """Sample size required to detect an absolute effect of ``epsilon``.

    Evaluates ``(z_{1 - alpha/2} + z_{1 - beta})**2 * 2 * std**2 / epsilon**2``
    with standard-normal quantiles and rounds the result up.

    NOTE(review): the ``2 * std**2`` term looks like the variance of a
    difference between two equal-variance groups, so the result is presumably
    a per-group size — confirm against the intended experiment design.

    Args:
        epsilon: Absolute effect to detect, in the same units as ``std``.
        std: Standard deviation of the metric.
        alpha: Enters through the two-sided quantile ``1 - alpha / 2``.
        beta: Enters through the quantile ``1 - beta``.

    Returns:
        The rounded-up sample size as a Python ``int``.
    """
    z_alpha = norm.ppf(1 - alpha / 2)
    z_beta = norm.ppf(1 - beta)
    quantile_factor = (z_alpha + z_beta) ** 2
    summed_variance = 2 * std ** 2
    raw_size = quantile_factor * summed_variance / (epsilon ** 2)
    return int(np.ceil(raw_size))

def get_sample_size_arb(mu, std, eff=1.01, alpha=0.05, beta=0.2):
    """Sample size required to detect a relative effect.

    Converts the multiplicative effect ``eff`` (e.g. ``1.01`` for +1%) into an
    absolute one, ``(eff - 1) * mu``, and delegates to ``get_sample_size_abs``.

    Args:
        mu: Mean of the metric.
        std: Standard deviation of the metric.
        eff: Effect expressed as a multiplier of ``mu``.
        alpha: Forwarded unchanged to ``get_sample_size_abs``.
        beta: Forwarded unchanged to ``get_sample_size_abs``.

    Returns:
        Whatever ``get_sample_size_abs`` returns for the derived absolute effect.
    """
    absolute_effect = (eff - 1) * mu
    return get_sample_size_abs(absolute_effect, std, alpha, beta)

def check_ttest(a, b, alpha=0.05):
    """Student's t-test. Returns 1 if the difference is significant, else 0."""
    outcome = ttest_ind(a, b)
    # "Significant" means the p-value is strictly below alpha.
    return 1 if outcome.pvalue < alpha else 0

def get_minimal_determinable_effect(std, sample_size, alpha=0.05, beta=0.2):
    """Minimal detectable effect for a given sample size.

    Evaluates ``(z_{1 - alpha/2} + z_{1 - beta}) * sqrt(2 * std**2) /
    sqrt(sample_size)`` with standard-normal quantiles.

    Args:
        std: Standard deviation of the metric.
        sample_size: Number of observations the effect is computed for.
        alpha: Enters through the two-sided quantile ``1 - alpha / 2``.
        beta: Enters through the quantile ``1 - beta``.

    Returns:
        The effect size, in the same units as ``std``.
    """
    quantile_sum = norm.ppf(1 - alpha / 2) + norm.ppf(1 - beta)
    summed_std = (2 * (std ** 2)) ** 0.5
    return quantile_sum * summed_std / np.sqrt(sample_size)

In [5]:
# Load the sales data, parse the 'date' column as datetimes and sort by it.
df_sales = (
    pd.read_csv('2022-04-01T12_df_sales.csv', parse_dates=['date'])
    .sort_values('date')
)

## Task 1

In [15]:
# Restrict sales to the half-open window [2022-02-21, 2022-02-28).
df_experiment = df_sales.loc[
    lambda sales: (sales['date'] >= datetime(2022, 2, 21))
    & (sales['date'] < datetime(2022, 2, 28))
]

# Total price per user within the window, then the mean ('avg') and
# standard deviation ('std') of those per-user totals.
df_desc_stat = (
    df_experiment
    .groupby('user_id', as_index=False)['price']
    .sum()
    .rename(columns={'price': 'sum_price'})
    .agg(
        avg=('sum_price', 'mean'),
        std=('sum_price', 'std'),
    )
)

In [20]:
# Display the summary table: rows 'avg' and 'std' of the per-user 'sum_price'.
df_desc_stat

Unnamed: 0,sum_price
avg,1234.687198
std,811.180339


In [17]:
# Sample size for an absolute effect of 20, rounded to the nearest ten.
# The std is read by label: `df_desc_stat` is indexed by 'avg'/'std', and
# integer keys on a label-indexed Series are no longer treated as positions
# in newer pandas.
round(
    get_sample_size_abs(
        20,
        df_desc_stat.loc['std', 'sum_price'],
        alpha=0.05,
        beta=0.1,
    ),
    -1,
)

34570

In [18]:
# Same sample-size question phrased as a relative effect: an absolute lift of
# 20 is expressed as a multiplier of the mean. Statistics are read by label
# ('avg'/'std') rather than by integer position, which newer pandas no longer
# supports on a label-indexed Series.
round(
    get_sample_size_arb(
        df_desc_stat.loc['avg', 'sum_price'],
        df_desc_stat.loc['std', 'sum_price'],
        eff=1 + (20 / df_desc_stat.loc['avg', 'sum_price']),
        alpha=0.05,
        beta=0.1,
    ),
    -1,
)

34570

## Task 2

In [26]:
# Number of distinct users in the experiment window; half of them is used as
# the sample size passed to the MDE helper.
users = df_experiment.user_id.nunique()

# The std is read by label ('std') rather than by integer position, which
# newer pandas no longer supports on a label-indexed Series.
round(
    get_minimal_determinable_effect(
        std=df_desc_stat.loc['std', 'sum_price'],
        sample_size=users // 2,
        alpha=0.05,
        beta=0.1,
    )
)

33