In [1]:
import numpy as np
import pandas as pd

import scipy
import scipy.stats
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

In [4]:
# Tab-separated file without a header row; the two columns are named here.
data = pd.read_csv(
    "data/banner_click_stat.txt",
    sep="\t",
    header=None,
    names=["banner_a", "banner_b"],
)
data.head()

Unnamed: 0,banner_a,banner_b
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0


In [5]:
# Summary statistics per banner column. The columns appear to hold 0/1 click
# flags (min 0, max 1), in which case `mean` is the observed click proportion.
data.describe()

Unnamed: 0,banner_a,banner_b
count,1000.0,1000.0
mean,0.037,0.053
std,0.188856,0.224146
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


### Интервальные оценки долей

In [6]:
# Normal-approximation confidence interval for each banner's proportion,
# computed with the same call for both columns.
conf_int_banner_a, conf_int_banner_b = (
    proportion_confint(sum(data[column]), data.shape[0], method="normal")
    for column in ("banner_a", "banner_b")
)

In [7]:
# Show both (lower, upper) intervals side by side: banner_a first, then banner_b.
print(conf_int_banner_a, conf_int_banner_b)

(0.02530064022092865, 0.04869935977907135) (0.03911451622486782, 0.06688548377513218)


### Z-критерий (независимые выборки)

In [8]:
def prop_diff_confint_ind(sample1, sample2, alpha=.05):
    """Wald confidence interval for the difference of two independent proportions.

    The interval is ``(p1 - p2) +/- z * sqrt(p1(1-p1)/n1 + p2(1-p2)/n2)``,
    where ``p1``/``p2`` are the sample means, ``n1``/``n2`` the sample sizes and
    ``z`` the standard-normal quantile of order ``1 - alpha / 2``.

    Parameters
    ----------
    sample1, sample2 : array-like of numbers
        The two samples. Each sample mean is used as the estimated proportion,
        so the values are expected to be 0/1 indicators.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1, or either sample is empty.
    """
    # Outside (0, 1) the normal quantile is NaN or infinite, which would
    # silently produce a meaningless interval.
    level = np.asarray(alpha, dtype=float)
    if not np.all((level > 0) & (level < 1)):
        raise ValueError("alpha must be strictly between 0 and 1")

    values1 = np.asarray(sample1, dtype=float)
    values2 = np.asarray(sample2, dtype=float)
    n1, n2 = values1.size, values2.size
    if n1 == 0 or n2 == 0:
        raise ValueError("both samples must be non-empty")

    z = scipy.stats.norm.ppf(1 - alpha / 2)
    p1 = values1.sum() / n1
    p2 = values2.sum() / n2

    # The half-width is shared by both boundaries, so compute it once.
    diff = p1 - p2
    half_width = z * np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
    return diff - half_width, diff + half_width

In [9]:
def prop_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of two independent proportions.

    The sample means ``p1`` and ``p2`` are compared using the pooled proportion
    ``P = (p1*n1 + p2*n2) / (n1 + n2)``::

        z = (p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2))

    Parameters
    ----------
    sample1, sample2 : sized iterables of numbers
        The two samples; values are expected to be 0/1 indicators.

    Returns
    -------
    float
        The z statistic; negative when the first sample's proportion is smaller.
    """
    size_a, size_b = len(sample1), len(sample2)
    rate_a = sum(sample1) / size_a
    rate_b = sum(sample2) / size_b

    # Pooled estimate of the common proportion assumed under the null hypothesis.
    pooled_rate = (rate_a * size_a + rate_b * size_b) / (size_a + size_b)
    pooled_variance = pooled_rate * (1 - pooled_rate) * (1 / size_a + 1 / size_b)
    return (rate_a - rate_b) / np.sqrt(pooled_variance)

In [24]:
def prop_diff_z_test(z_stat, alternative="two-sided"):
    """P-value of a z statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Observed value(s) of the z statistic.
    alternative : {"two-sided", "less", "greater"}, default "two-sided"
        Which tail(s) to use: both tails beyond ``|z_stat|``, the lower tail
        (``P(Z <= z_stat)``) or the upper tail (``P(Z >= z_stat)``).

    Returns
    -------
    float or numpy.ndarray
        The p-value(s) for the chosen alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised names.
    """
    if alternative not in ("two-sided", "less", "greater"):
        raise ValueError("alternative not recognized\nshould be 'two-sided', 'less' or 'greater'")

    if alternative == "less":
        return scipy.stats.norm.cdf(z_stat)

    # Upper-tail probabilities use the survival function instead of 1 - cdf:
    # for large z the cdf rounds to exactly 1.0, so the subtraction would
    # collapse a tiny but non-zero p-value to 0.
    if alternative == "greater":
        return scipy.stats.norm.sf(z_stat)
    return 2 * scipy.stats.norm.sf(np.abs(z_stat))

In [16]:
# Interval for the banner_a minus banner_b proportion difference, treating the
# two columns as independent samples (default alpha=.05).
prop_diff_confint_ind(data.banner_a, data.banner_b)

(-0.0341571385110543, 0.002157138511054299)

In [17]:
# Pooled z statistic for banner_a vs banner_b, treating the columns as
# independent samples; kept in Z for the p-value cells that follow.
Z = prop_diff_z_stat_ind(data.banner_a, data.banner_b)
Z

-1.7258261378415294

In [23]:
# Two-sided p-value for the independent-samples statistic (default alternative).
prop_diff_z_test(Z)

0.08437869601106662

In [27]:
# One-sided (lower-tail) p-value: the alternative is that banner_a's
# proportion is smaller than banner_b's.
prop_diff_z_test(Z, "less")

0.042189348005533284

### Z-критерий (связанные выборки)

In [42]:
def prop_diff_confint_rel(sample1, sample2, alpha=.05):
    """Wald confidence interval for the difference of two paired proportions.

    Observations are paired by position. With ``f`` the number of pairs equal
    to ``(1, 0)``, ``g`` the number of pairs equal to ``(0, 1)`` and ``n`` the
    number of pairs, the interval is::

        (f - g) / n  +/-  z * sqrt((f + g) / n**2 - (f - g)**2 / n**3)

    where ``z`` is the standard-normal quantile of order ``1 - alpha / 2``.

    Parameters
    ----------
    sample1, sample2 : array-like
        Paired samples of equal length; only the values 0 and 1 are counted.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1, the samples differ in
        length, or the samples are empty.
    """
    # Outside (0, 1) the normal quantile is NaN or infinite.
    level = np.asarray(alpha, dtype=float)
    if not np.all((level > 0) & (level < 1)):
        raise ValueError("alpha must be strictly between 0 and 1")

    first = np.asarray(sample1)
    second = np.asarray(sample2)
    # Pairing is positional, so a length mismatch means the pairs are wrong;
    # fail loudly instead of silently dropping the unmatched tail.
    if first.shape != second.shape:
        raise ValueError("paired samples must have the same length")
    n = first.size
    if n == 0:
        raise ValueError("samples must be non-empty")

    z = scipy.stats.norm.ppf(1 - alpha / 2)
    # Discordant pairs: f counts (1, 0), g counts (0, 1). Converted to Python
    # ints so the integer arithmetic below is exact.
    f = int(np.count_nonzero((first == 1) & (second == 0)))
    g = int(np.count_nonzero((first == 0) & (second == 1)))

    # The half-width is shared by both boundaries, so compute it once.
    diff = (f - g) / n
    half_width = z * np.sqrt((f + g) / n ** 2 - (f - g) ** 2 / n ** 3)
    return diff - half_width, diff + half_width

In [45]:
def prop_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the difference of two paired proportions.

    Observations are paired by position. With ``f`` the number of pairs equal
    to ``(1, 0)``, ``g`` the number of pairs equal to ``(0, 1)`` and ``n`` the
    number of pairs::

        z = (f - g) / sqrt(f + g - (f - g)**2 / n)

    The denominator is zero when there are no discordant pairs
    (``f == g == 0``), in which case the statistic is undefined.

    Parameters
    ----------
    sample1, sample2 : array-like
        Paired samples of equal length; only the values 0 and 1 are counted.

    Returns
    -------
    float
        The z statistic; negative when ``(0, 1)`` pairs outnumber ``(1, 0)`` pairs.

    Raises
    ------
    ValueError
        If the samples differ in length or are empty.
    """
    first = np.asarray(sample1)
    second = np.asarray(sample2)
    # Pairing is positional, so a length mismatch means the pairs are wrong;
    # fail loudly instead of silently dropping the unmatched tail.
    if first.shape != second.shape:
        raise ValueError("paired samples must have the same length")
    n = first.size
    if n == 0:
        raise ValueError("samples must be non-empty")

    # Discordant pairs: f counts (1, 0), g counts (0, 1). Converted to Python
    # ints so the integer arithmetic below is exact.
    f = int(np.count_nonzero((first == 1) & (second == 0)))
    g = int(np.count_nonzero((first == 0) & (second == 1)))
    return (f - g) / np.sqrt(f + g - (f - g) ** 2 / n)

In [44]:
# Same difference (banner_a minus banner_b), now treating the rows as paired
# observations; the interval depends only on the discordant pairs.
prop_diff_confint_rel(data.banner_a, data.banner_b)

(-0.02668926335921826, -0.00531073664078174)

In [48]:
# Paired-samples z statistic for banner_a vs banner_b; kept in Z2 for the
# p-value cells that follow.
Z2 = prop_diff_z_stat_rel(data.banner_a, data.banner_b)
Z2

-2.9337310438325916

In [50]:
# One-sided (lower-tail) p-value for the paired statistic: the alternative is
# that banner_a's proportion is smaller than banner_b's.
prop_diff_z_test(Z2, "less")

0.0016745714232725471

In [51]:
# Two-sided p-value for the paired statistic (default alternative).
prop_diff_z_test(Z2)

0.0033491428465450834