# Commonly used functions in A/B testing

## Calculate the sample size

### Z test

Source: https://carlosgrande.me/sample-size-determination/

The following example assumes that the outcome in both groups follows a Bernoulli distribution (each visitor either converts or does not).

In [1]:
# Libraries
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib import pyplot
from scipy import stats
from statsmodels.stats import weightstats as stests

import noshmishmosh

# Functions
def tolist(tag, records=None):
    """Collect the value stored under ``tag`` from every record.

    Args:
        tag: Key to look up in each record.
        records: Iterable of mappings to read from. When omitted, the
            notebook-level ``visits`` variable is used; it is looked up at
            call time, so it must have been defined before the call.

    Returns:
        list: The ``tag`` value of each record, in iteration order.

    Raises:
        KeyError: If a record has no ``tag`` key.
    """
    if records is None:
        # Backward-compatible default: fall back to the global visit list.
        records = visits
    return [record[tag] for record in records]

In [2]:
# Visit records exposed by the noshmishmosh module
visits = noshmishmosh.customer_visits

# One DataFrame column per record field; only 'id' is renamed (to 'ids')
df_visits = pd.DataFrame({
    column: tolist(field)
    for column, field in (
        ('ids', 'id'),
        ('name', 'name'),
        ('clickedthrough', 'clickedthrough'),
        ('purchased', 'purchased'),
        ('moneyspent', 'moneyspent'),
    )
})

print(df_visits.head())

     ids            name  clickedthrough  purchased  moneyspent
0  83421    Michael Todd            True      False         0.0
1  46042  Brianna Harmon            True      False         0.0
2  23766    Mario Arnold           False      False         0.0
3  20859      Paul Quinn           False      False         0.0
4  57771    Jerome Moore            True      False         0.0


In [3]:
# Test that both populations have the same proportion.
def z_calc(p1, p2, n1, n2):
    """Return the z statistic of a pooled two-proportion test.

    Args:
        p1: Proportion observed in the first group.
        p2: Proportion observed in the second group.
        n1: Size of the first group.
        n2: Size of the second group.

    Returns:
        ``p2 - p1`` divided by the pooled standard error; positive when
        ``p2`` is larger than ``p1``.
    """
    # Pooled proportion: size-weighted average of the two group proportions
    pooled = (p1*n1 + p2*n2) / (n1 + n2)
    # Variance of the difference in proportions under the pooled estimate
    pooled_variance = pooled*(1 - pooled)*((1.0 / n1) + (1.0 / n2))
    return (p2 - p1) / math.sqrt(pooled_variance)

In [4]:
# Sample calculator
def sample_required(p1, p_diff, alpha):
    """Find the smallest per-group sample size at which a lift is significant.

    Both groups are given the same size ``n``. Starting at ``n = 1``, the size
    grows until the one-sided z statistic returned by ``z_calc`` for ``p1``
    versus ``p1 + p_diff`` exceeds the critical value for ``alpha``.

    Note that there is no power parameter: the result is the size at which an
    observed difference of exactly ``p_diff`` just reaches significance.

    Args:
        p1: Baseline proportion (must be >= 0).
        p_diff: Absolute increase over the baseline; must be > 0 and
            ``p1 + p_diff`` must not exceed 1.
        alpha: One-sided significance level, strictly between 0 and 1.

    Returns:
        int: The smallest ``n`` whose one-sided p-value is below ``alpha``.

    Raises:
        ValueError: If an argument is outside the ranges above. Without this
            check a non-positive ``p_diff`` or ``alpha`` would loop forever.
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must be strictly between 0 and 1, got {}'.format(alpha))
    if not p_diff > 0:
        raise ValueError('p_diff must be positive, got {}'.format(p_diff))
    if p1 < 0 or p1 + p_diff > 1:
        raise ValueError(
            'p1 and p1 + p_diff must both lie in [0, 1], got {} and {}'.format(p1, p1 + p_diff))

    # "1 - cdf(z) < alpha" is equivalent to "z > isf(alpha)"; computing the
    # critical value once avoids a scipy call on every iteration.
    z_critical = stats.norm.isf(alpha)

    n = 1
    while z_calc(p1, p1 + p_diff, n1=n, n2=n) <= z_critical:
        n += 1
    return n

In [5]:
# Conversions: visits whose 'purchased' flag is True
paying_visitors = df_visits.loc[df_visits['purchased'] == True, 'ids'].count()
total_visitors = df_visits['ids'].count()

# Baseline conversion rate: share of visitors that purchased
baseline = paying_visitors / total_visitors

print(f'Number of visitors that purchased: {paying_visitors}')
print(f'Number of total visitors: {total_visitors}')
print(f'The baseline is: {baseline * 100} %')

Number of visitors that purchased: 93
Number of total visitors: 500
The baseline is: 18.6 %


In [6]:
# Revenue target to reach (presumably in the same currency as the payments -- confirm)
revenue = 1240

payments = noshmishmosh.money_spent
print('These are the first 5 payments sample: {}'.format(payments[:5]))

# Keep the mean at full precision and round only for display: dividing by a
# rounded mean can understate the number of payments needed.
mean_payments = np.mean(payments)
print('The average payment is: {} $'.format(round(mean_payments)))

# Round up: a fraction of a payment still requires one more whole payment
n_payments = np.ceil(revenue/mean_payments)
print('We need {} payments to pull in the revenue'.format(int(n_payments)))

These are the first 5 payments sample: [39.01, 10.16, 36.88, 23.41, 33.49]
The average payment is: 27 $
We need 46 payments to pull in the revenue


In [7]:
# Required lift: payments needed expressed as a share of all visitors
lift = n_payments / total_visitors
print(f'The lift required is: {lift * 100}%')

The lift required is: 9.2%


In [8]:
# Smallest group size at which the required lift is significant at the 5% level
sample_size = sample_required(p1=baseline, p_diff=lift, alpha=.05)

print(f'The final sample size is calculated with a baseline of {baseline * 100}% and a lift of {lift * 100}%.', '\n')
print(f'For this example Nosh Mish Mosh needs to show the new pictures to {sample_size} people to make sure there is any improvement')

The final sample size is calculated with a baseline of 18.6% and a lift of 9.2%. 

For this example Nosh Mish Mosh needs to show the new pictures to 114 people to make sure there is any improvement


In [11]:
# Define parameters
alpha = 0.05  # Significance level
power = 0.80  # Desired statistical power
# Standardized effect size (difference between the two means divided by the
# standard deviation), so no separate standard-deviation input is needed.
# 0.2, 0.5 and 0.8 are the conventional small, medium and large values.
effect_size = 0.2

# Calculate sample size (z-test, equal group sizes via ratio=1.0)
sample_size = sm.stats.zt_ind_solve_power(effect_size=effect_size, alpha=alpha, power=power, ratio=1.0)

# Round up to the nearest whole number
sample_size = math.ceil(sample_size)
print(f"Required sample size: {sample_size}")

Required sample size: 393


### T test

In [10]:
# Define parameters
alpha = 0.05  # Significance level
power = 0.80  # Desired statistical power
# Standardized effect size (difference between the two means divided by the
# standard deviation), so no separate standard-deviation input is needed.
# 0.2, 0.5 and 0.8 are the conventional small, medium and large values.
effect_size = 0.2

# Calculate sample size (t-test, equal group sizes via ratio=1.0)
sample_size = sm.stats.tt_ind_solve_power(effect_size=effect_size, alpha=alpha, power=power, ratio=1.0)

# Round up to the nearest whole number
sample_size = math.ceil(sample_size)
print(f"Required sample size: {sample_size}")

Required sample size: 394


  return np.clip(_boost._nct_sf(x, df, nc), 0, 1)
  return np.clip(_boost._nct_cdf(x, df, nc), 0, 1)


If your A/B test involves different types of data or hypotheses, you may need to use a different statistical test. Here are some common tests for different scenarios:

Two-Sample t-Test: Used for comparing the means of two independent groups (e.g., A/B groups) with continuous data.

Chi-Squared Test: Used for comparing proportions or frequencies in categorical data between two groups.

Mann-Whitney U Test (Wilcoxon Rank-Sum Test): Used when your data is not normally distributed or when you're comparing medians instead of means.

Paired t-Test: Used when you have paired data points, such as before-and-after measurements, and you want to compare the means within the same subjects.

Logistic Regression: Used when your outcome variable is binary (e.g., click-through rates), and you want to model the relationship between the independent variable (A/B group) and the binary outcome.

**effect_size**: standardized effect size, difference between the two means divided by the standard deviation. If ratio=0, then this is the standardized mean in the one sample test.

## Calculate the significance

## Z test

In [1]:
# Simulated data, drawn from a seeded generator so the printed result is
# reproducible on every run
rng = np.random.default_rng(42)
group1 = rng.normal(100, 10, 1000)
group2 = rng.normal(105, 10, 1000)

# Two-sample z-test on the group means
z_stat, p_value = stests.ztest(group1, group2)
print(f"Z-statistic: {z_stat}, P-value: {p_value}")

Z-statistic: -9.824722993682224, P-value: 8.811594575708955e-23


Z-Test and T-Test

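Z or T Statistic: The observed difference between the two group means expressed in units of its standard error, i.e. how many standard errors the difference is away from zero (the value expected under the null hypothesis).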

P-value: The probability of observing a statistic at least as extreme as the one computed, assuming the null hypothesis is true. If it is less than alpha (commonly 0.05), you reject the null hypothesis. In other words, a model that uses the groups to explain the variable does a better job than a model that does not take the groups into account.

## T test

In [2]:
# Simulated data, drawn from a seeded generator so the printed result is
# reproducible on every run
rng = np.random.default_rng(42)
group1 = rng.normal(100, 10, 30)
group2 = rng.normal(105, 10, 30)

# Two-sample t-test on the group means
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -2.3616920771981875, P-value: 0.021570663729226225


## Chi test

In [3]:
# Simulated contingency table: one row per group, one column per category
observed = np.array([
    [10, 20, 30],
    [20, 30, 40],
])

# Chi-square test of independence between group and category
chi_stat, p_value, dof, expected = stats.chi2_contingency(observed)
print(
    f"Chi-square statistic: {chi_stat}, P-value: {p_value}, Degrees of freedom: {dof}",
    f"Expected frequencies: \n{expected}",
    sep="\n",
)

Chi-square statistic: 0.7936507936507936, P-value: 0.6724514275370008, Degrees of freedom: 2
Expected frequencies: 
[[12. 20. 28.]
 [18. 30. 42.]]


Chi-Square Test

Chi-Square Statistic: Measures how far the observed counts deviate from the expected counts.

P-value: If less than alpha (commonly 0.05), then the result is significant, and the null hypothesis that the variables are independent is rejected.

Degrees of Freedom: (Number of rows - 1) * (Number of columns - 1)

Expected Frequencies: Values that would be expected if there was no association between the variables.

## Mann-Whitney U test

In [None]:
# NOTE(review): `df` is not defined in the visible cells -- presumably a
# DataFrame with 'group_info' and 'c' columns loaded elsewhere; confirm.
# Missing values of 'c' are replaced with 0 before testing.
sample_a = df[df['group_info']=='a']['c'].fillna(0)
sample_b = df[df['group_info']=='b']['c'].fillna(0)

# Perform the one-sided Mann-Whitney U test.
# alternative='greater': the alternative hypothesis is that sample A tends to
# be larger than sample B (use 'less' for the opposite direction).
statistic, p_value = stats.mannwhitneyu(sample_a, sample_b, alternative='greater')

# Output the results
print(f"U Statistic: {statistic}")
print(f"P-Value: {p_value}")

# Interpret the results
alpha = 0.05  # Set your desired significance level
if p_value < alpha:
    print("Reject the null hypothesis: Sample A tends to be larger than Sample B.")
else:
    print("Fail to reject the null hypothesis: Not significant enough.")

## Check the distribution of the control and test groups

In [None]:
# Plot the distribution of the metric for the control and test groups side by side.
# NOTE(review): `df` is not defined in the visible cells -- presumably a
# DataFrame with 'group_info' and 'metric' columns; confirm.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Draw on the axes created above instead of re-selecting them through the
# pyplot state machine (plt.subplot).
plot1 = ax[0].hist(df[df['group_info']=='control']['metric'],
                   edgecolor='k', linewidth=1.0, color='blue')
ax[0].set_title("metric_control")

plot2 = ax[1].hist(df[df['group_info']=='test']['metric'],
                   edgecolor='k', linewidth=1.0, color='green')
ax[1].set_title("metric_test")

plt.show()

In [None]:
# Overlay the kernel density estimate of the metric for each group.
# seaborn is already imported as `sns` in the notebook's import cell; the
# trailing semicolon keeps the FacetGrid repr out of the cell output.
sns.displot(data=df, x='metric', hue='group_info', kind='kde');