In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from sklearn.preprocessing import StandardScaler
import scipy.stats as st

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# tested lib
from ab_test.user import User
from ab_test.experiment import *
from ab_test.hasher_implems import *

from tools.chi_squared import ChiSquaredTest as chi

import time

# Chi Squared Test

We consider experiments with 2 variations of 50% each.

## Definitions

### Hypothesis

* Experiment A has _r = 2_ levels (variation 1 and variation 2)
* Experiment B has _c = 2_ levels (variation 1 and variation 2)

The null hypothesis states that knowing the variation in experiment A does not help you predict the variation in experiment B.

Ho: Variable A and Variable B are independent.

Ha: Variable A and Variable B are not independent.

### Degrees of Freedom

`DF = (r - 1) * (c - 1) = 1`

### Expected Frequencies

The expected frequency counts are computed separately for each level of one categorical variable at each level of the other categorical variable. Compute r * c expected frequencies, according to the following formula.
`Er,c = (nr * nc) / n`

where Er,c is the expected frequency count for level r of Variable A and level c of Variable B, nr is the total number of sample observations at level r of Variable A, nc is the total number of sample observations at level c of Variable B, and n is the total sample size.

For example, for a sample of 100 users, in a perfect case, we would have:

`Er,c = (50*50) / 100 = 25`

### Test Statistic

The test statistic is a chi-square random variable (Χ2) defined by the following equation.
`Χ2 = Σ [ (Or,c - Er,c)2 / Er,c ]`

where Or,c is the observed frequency count at level r of Variable A and level c of Variable B, and Er,c is the expected frequency count at level r of Variable A and level c of Variable B.


### P-value

The P-value is the probability of observing a sample statistic as extreme as the test statistic.

### Conclude

* If observed chi-square < critical chi-square, then variables are not related.
* If observed chi-square > critical chi-square, then variables are not independent (and hence may be related).

For DF=1 and a precision of 5% (α=0.05), the critical chi-square is 3.841.

## Tools

### Tool methods for data generation

In [2]:
def gen_data(hasher: Hasher, population, *args):
    
    header = ["/", "ExpA:Var1", "ExpA:Var2", "Sum"]
    data = []
    if (args):
        i = args[0]
        expA = Experiment(hasher, f'{2*i:06d}')
        expB = Experiment(hasher, f'{2*i + 1:06d}')
    else:
        expA = Experiment(hasher)
        expB = Experiment(hasher)
    
    varA1B1 = 0
    varA1B2 = 0
    varA2B1 = 0
    varA2B2 = 0
    
    for i in range(1, population+1):
        user = User(i)
        varA = expA.assign(user)
        varB = expB.assign(user)
        
        if varA.name == "variation1":
            if varB.name == "variation1":
                varA1B1 += 1
            else:
                varA1B2 += 1
        else:
            if varB.name == "variation1":
                varA2B1 += 1
            else:
                varA2B2 += 1
        
    df = pd.DataFrame(np.array([["ExpB:Var1", varA1B1,           varA2B1,       varA1B1+varA2B1],
                                ["ExpB:Var2", varA1B2,           varA2B2,       varA1B2+varA2B2],
                                ["Sum",       varA1B1+varA1B2, varA2B1+varA2B2, varA1B1+varA2B1+varA1B2+varA2B2]]),
                      columns=header)
    
    df = pd.concat([df["/"],
                    pd.to_numeric(df["ExpA:Var1"]),
                    pd.to_numeric(df["ExpA:Var2"]),
                    pd.to_numeric(df["Sum"])],
                    axis = 1)
    
    matrix = [[varA1B1,         varA2B1,         varA1B1+varA2B1],
              [varA1B2,         varA2B2,         varA1B2+varA2B2],
              [varA1B1+varA1B2, varA2B1+varA2B2, varA1B1+varA2B1+varA1B2+varA2B2]]
    return df, matrix

### Test definitions

In [3]:
population = 1000
nb_exp = 1000

## Built-In Hash Method

### Data Generation

In [4]:
# Built-In Hash

df_bi, matrix_bi = gen_data(BuiltInHasher(), population)

df_bi

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,254,235,489
1,ExpB:Var2,257,254,511
2,Sum,511,489,1000


### Test Statistic

In [5]:
chi.statistic_test(matrix_bi)

0.27198547422482067

### Bigger scale test

In [6]:
begin = time.time()
stat_tests_bi = []

for i in range(0, nb_exp):
  df_bi, matrix_bi = gen_data(BuiltInHasher(), population, i)
  stat_tests_bi.append(chi.statistic_test(matrix_bi))
print(f"{time.time() - begin}s")


5.971721172332764s


In [7]:
print(f"median: {np.median(stat_tests_bi)}")
print(f"mean: {np.mean(stat_tests_bi)}")
print(f"std deviation: {np.std(stat_tests_bi)}")

median: 0.47619918916455284
mean: 1.0504442583207723
std deviation: 1.5013461019462877


### Conclusion

We can accept the null hypothesis (assignations of experiments A and B are independant) if the median above is inferior to the critical value 3.84.

## MD5

### Data Generation

In [8]:
df_md5, matrix_md5 = gen_data(Md5Hasher(), population)

df_md5

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,239,237,476
1,ExpB:Var2,277,247,524
2,Sum,516,484,1000


### Chi Squared Test

In [9]:
chi.statistic_test(matrix_md5)

0.7026801577344564

### Bigger scale test

In [10]:
begin = time.time()
stat_tests_md5 = []

for i in range(0, nb_exp):
  df_md5, matrix_md5 = gen_data(Md5Hasher(), population)
  stat_tests_md5.append(chi.statistic_test(matrix_md5))
print(f"{time.time() - begin}s")


12.946851968765259s


In [11]:
print(f"median: {np.median(stat_tests_md5)}")
print(f"mean: {np.mean(stat_tests_md5)}")
print(f"std deviation: {np.std(stat_tests_md5)}")

median: 0.4539387341245711
mean: 1.0146829342559969
std deviation: 1.4785819904031852


### Conclusion

We can accept the null hypothesis (assignations of experiments A and B are independant) if the median above is inferior to the critical value 3.84.

## Sha256

### Data Generation

In [12]:
df_sha, matrix_sha = gen_data(Sha256Hasher(), population)

df_sha

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,253,249,502
1,ExpB:Var2,247,251,498
2,Sum,500,500,1000


### Chi Squared Test

In [13]:
chi.statistic_test(matrix_sha)

0.06400102401638426

### Bigger scale test

In [14]:
begin = time.time()
stat_tests_sha = []

for i in range(0, nb_exp):
  df_sha, matrix_sha = gen_data(Sha256Hasher(), population)
  stat_tests_sha.append(chi.statistic_test(matrix_sha))
print(f"{time.time() - begin}s")


13.351825952529907s


In [15]:
print(f"median: {np.median(stat_tests_sha)}")
print(f"mean: {np.mean(stat_tests_sha)}")
print(f"std deviation: {np.std(stat_tests_sha)}")

median: 0.48530934610776244
mean: 1.0353418743915541
std deviation: 1.3881691834500394


### Conclusion

We can accept the null hypothesis (assignations of experiments A and B are independant) if the median above is inferior to the critical value 3.84.