### Повторим АА-тесты, но теперь посчитаем ошибку первого рода по комбинации метрик, сравнивая среднее p-value по этой комбинации с заданным порогом

In [1]:
import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt

from scipy.stats import ttest_ind_from_stats
from tqdm.notebook import tqdm
from itertools import combinations

%matplotlib inline

In [3]:
# Metrics whose p-values are averaged into a single combined statistic.
metric_combination = [f'metric_{idx}' for idx in range(5)]
# Decision threshold applied to the mean p-value of the combination.
# NOTE(review): presumably calibrated elsewhere so that the false-positive
# rate matches alpha -- confirm where this value comes from.
mean_pvalue_th = 0.2905144403455091

In [4]:
# Simulation parameters for the synthetic A/A experiments.
# significance level
alpha = 0.05
# number of observations (users/devices/etc)
N = 10000
# number of metrics
k = 100
# number of experiments
r = 10000
# Fixed seed so that the simulated data -- and every number derived from it --
# can be reproduced after Restart & Run All.
seed = 42
rng = np.random.default_rng(seed)
# extra variables
metric_names = [f'metric_{i}' for i in range(k)]
aa_names = [f'aa_{i}' for i in range(r)]
# generate metrics: one row per metric, one column per observation,
# drawn in a single vectorized call instead of k separate ones
metrics = rng.lognormal(mean=2, sigma=1.2, size=(k, N))
metrics_df = pd.DataFrame(metrics.T, columns=metric_names)
metrics_df['idx'] = range(N)
# generate experiments: every observation is assigned to group 1 with
# probability 0.5, independently for each experiment (one row per experiment)
experiments = rng.integers(0, 2, size=(r, N), dtype=bool)
experiments_df = pd.DataFrame(experiments.T, columns=aa_names).astype('int8')
# one row per observation: metric values, idx, then the 0/1 group labels
total_df = pd.concat([metrics_df, experiments_df], axis=1)
total_df.head(3)

Unnamed: 0,metric_0,metric_1,metric_2,metric_3,metric_4,metric_5,metric_6,metric_7,metric_8,metric_9,...,aa_9990,aa_9991,aa_9992,aa_9993,aa_9994,aa_9995,aa_9996,aa_9997,aa_9998,aa_9999
0,7.267821,15.349525,1.303401,2.430369,2.14902,19.634085,4.45047,5.481491,87.640495,14.089879,...,0,1,1,0,0,1,1,1,0,1
1,18.271534,16.072554,26.309688,30.084593,14.367928,30.213338,8.216269,1.753229,13.491277,12.850651,...,1,1,1,1,0,0,1,0,1,1
2,5.249589,1.160538,16.357229,16.397229,1.720516,6.597463,23.684425,7.38651,1.596335,15.50368,...,1,0,0,0,1,1,1,1,1,1


In [5]:
%%time
final_dataset = []
for exp_name in tqdm(aa_names):
    means = total_df.groupby([exp_name])[[f'metric_{i}' for i in range(k)]].mean().to_dict()
    stds = total_df.groupby([exp_name])[[f'metric_{i}' for i in range(k)]].std().to_dict()
    metrics_pvalues = []
    for metric in metric_names:
        mean1, mean2 = means[metric][0], means[metric][1]
        std1, std2 = stds[metric][0], stds[metric][1]
        p_value = ttest_ind_from_stats(mean1=mean1, 
                                       std1=std1, 
                                       nobs1=N//2, 
                                       mean2=mean2, 
                                       std2=std2, 
                                       nobs2=N//2).pvalue
        metrics_pvalues.append(p_value)
    final_dataset.append(metrics_pvalues)

  0%|          | 0/10000 [00:00<?, ?it/s]

Wall time: 6min 20s


In [6]:
# Arrange the collected p-values as a table:
# one row per A/A experiment, one column per metric.
pvalues_dataset = pd.DataFrame(
    data=final_dataset,
    index=aa_names,
    columns=metric_names,
)
pvalues_dataset.head(3)

Unnamed: 0,metric_0,metric_1,metric_2,metric_3,metric_4,metric_5,metric_6,metric_7,metric_8,metric_9,...,metric_90,metric_91,metric_92,metric_93,metric_94,metric_95,metric_96,metric_97,metric_98,metric_99
aa_0,0.614274,0.556912,0.715889,0.867129,0.77613,0.82237,0.56792,0.836978,0.198598,0.766151,...,0.010625,0.395209,0.515301,0.686506,0.912318,0.574777,0.711903,0.447873,0.357999,0.227344
aa_1,0.453994,0.870819,0.94236,0.548963,0.86535,0.232012,0.996324,0.232398,0.121449,0.213824,...,0.653838,0.124485,0.19946,0.468083,0.590155,0.89038,0.444742,0.563984,0.169438,0.208511
aa_2,0.289379,0.666592,0.343445,0.189515,0.216525,0.234113,0.53966,0.336703,0.402156,0.21866,...,0.294007,0.558585,0.810261,0.579004,0.614187,0.563574,0.276023,0.432687,0.751909,0.25714


In [7]:
# Per-experiment average of the p-values over the metrics in the chosen combination.
mean_pvalues = (
    pvalues_dataset
    .loc[:, metric_combination]
    .mean(axis='columns')
    .to_numpy()
)

In [9]:
# Share of A/A experiments whose mean p-value falls below the threshold,
# i.e. the empirical false-positive rate for the combination.
(mean_pvalues < mean_pvalue_th).mean()

0.0527

Эмпирическая ошибка первого рода для комбинации метрик приблизительно равна заданному уровню значимости alpha = 0.05.