In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from sklearn.preprocessing import StandardScaler
import scipy.stats as st

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# tested lib
from user import User
from experiment import *
from hasher_implems import *

import time

# Chi Squared Test

We consider experiments with 2 variations of 50% each.

## Definitions

### Hypothesis

* Experiment A has _r = 2_ levels (variation 1 and variation 2)
* Experiment B has _c = 2_ levels (variation 1 and variation 2)

The null hypothesis states that knowing the variation in experiment A does not help you predict the variation in experiment B.

Ho: Variable A and Variable B are independent.

Ha: Variable A and Variable B are not independent.

### Degrees of Freedom

`DF = (r - 1) * (c - 1) = 1`

### Expected Frequencies

The expected frequency counts are computed separately for each level of one categorical variable at each level of the other categorical variable. Compute r * c expected frequencies, according to the following formula.
`Er,c = (nr * nc) / n`

where Er,c is the expected frequency count for level r of Variable A and level c of Variable B, nr is the total number of sample observations at level r of Variable A, nc is the total number of sample observations at level c of Variable B, and n is the total sample size.

For example, for a sample of 100 users, in a perfect case, we would have:

`Er,c = (50*50) / 100 = 25`

### Test Statistic

The test statistic is a chi-square random variable (Χ2) defined by the following equation.
`Χ2 = Σ [ (Or,c - Er,c)2 / Er,c ]`

where Or,c is the observed frequency count at level r of Variable A and level c of Variable B, and Er,c is the expected frequency count at level r of Variable A and level c of Variable B.


### P-value

The P-value is the probability of observing a sample statistic as extreme as the test statistic.

### Conclude

* If observed chi-square < critical chi-square, then variables are not related.
* If observed chi-square > critical chi-square, then variables are not independent (and hence may be related).

For DF=1 and a precision of 5% (α=0.05), the critical chi-square is 3.841.

## Tools

### Tool methods for chi squared test

In [2]:
def expected(matrix, i, j):
    return (matrix_bi[i][2])*(matrix_bi[2][j])/matrix_bi[2][2]

def step(matrix, i, j):
    exp = expected(matrix, i, j)
    case = matrix[i][j]
    #print(f"step {i},{j}: case={case}, exp={exp}")
    return (case-exp) * (case-exp)/exp

def statistic_test(matrix):
    result = 0
    for i in range(0, 2):
        for j in range(0, 2):
            result += step(matrix, i, j)
    return result

### Tool methods for data generation

In [3]:
def gen_data(hasher: Hasher, population, *args):
    
    header = ["/", "ExpA:Var1", "ExpA:Var2", "Sum"]
    data = []
    if (args):
        i = args[0]
        expA = Experiment(hasher, 2*i)
        expB = Experiment(hasher, 2*i +1)
    else:
        expA = Experiment(hasher)
        expB = Experiment(hasher)
    
    varA1B1 = 0
    varA1B2 = 0
    varA2B1 = 0
    varA2B2 = 0
    
    for i in range(1, population+1):
        user = User(i)
        varA = expA.assign(user)
        varB = expB.assign(user)
        
        if varA.name == "variation1":
            if varB.name == "variation1":
                varA1B1 += 1
            else:
                varA1B2 += 1
        else:
            if varB.name == "variation1":
                varA2B1 += 1
            else:
                varA2B2 += 1
        
    df = pd.DataFrame(np.array([["ExpB:Var1", varA1B1,           varA2B1,       varA1B1+varA2B1],
                                ["ExpB:Var2", varA1B2,           varA2B2,       varA1B2+varA2B2],
                                ["Sum",       varA1B1+varA1B2, varA2B1+varA2B2, varA1B1+varA2B1+varA1B2+varA2B2]]),
                      columns=header)
    
    df = pd.concat([df["/"],
                    pd.to_numeric(df["ExpA:Var1"]),
                    pd.to_numeric(df["ExpA:Var2"]),
                    pd.to_numeric(df["Sum"])],
                    axis = 1)
    
    matrix = [[varA1B1,         varA2B1,         varA1B1+varA2B1],
              [varA1B2,         varA2B2,         varA1B2+varA2B2],
              [varA1B1+varA1B2, varA2B1+varA2B2, varA1B1+varA2B1+varA1B2+varA2B2]]
    return df, matrix

### Test definitions

In [4]:
population = 1000
nb_exp = 1000

## Built-In Hash Method

### Data Generation

In [5]:
# Built-In Hash

df_bi, matrix_bi = gen_data(BuiltInHasher(), population)

df_bi

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,260,264,524
1,ExpB:Var2,223,253,476
2,Sum,483,517,1000


### Test Statistic

In [6]:
statistic_test(matrix_bi)

0.7661763535307267

### Bigger scale test

In [7]:
begin = time.time()
stat_tests_bi = []

for i in range(0, nb_exp):
  df_bi, matrix_bi = gen_data(BuiltInHasher(), population, i)
  stat_tests_bi.append(statistic_test(matrix_bi))
print(f"{time.time() - begin}s")


5.317546844482422s


In [8]:
print(f"median: {np.median(stat_tests_bi)}")
print(f"mean: {np.mean(stat_tests_bi)}")
print(f"std deviation: {np.std(stat_tests_bi)}")

median: 0.47808088521624836
mean: 1.0708327218981721
std deviation: 1.5297446361379918


### Conclusion

We can accept the null hypothesis (assignations of experiments A and B are independant) if the median above is inferior to the critical value 3.84.

## MD5

### Data Generation

In [9]:
df_md5, matrix_md5 = gen_data(Md5Hasher(), population)

df_md5

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,240,268,508
1,ExpB:Var2,249,243,492
2,Sum,489,511,1000


### Chi Squared Test

In [10]:
statistic_test(matrix_md5)

1.7687653208839074

### Bigger scale test

In [11]:
begin = time.time()
stat_tests_md5 = []

for i in range(0, nb_exp):
  df_md5, matrix_md5 = gen_data(Md5Hasher(), population)
  stat_tests_md5.append(statistic_test(matrix_md5))
print(f"{time.time() - begin}s")


12.799982786178589s


In [12]:
print(f"median: {np.median(stat_tests_md5)}")
print(f"mean: {np.mean(stat_tests_md5)}")
print(f"std deviation: {np.std(stat_tests_md5)}")

median: 3.600366111682077
mean: 4.420452708895606
std deviation: 3.4571636860796544


### Conclusion

We can accept the null hypothesis (assignations of experiments A and B are independant) if the median above is inferior to the critical value 3.84.

## Sha256

### Data Generation

In [13]:
df_sha, matrix_sha = gen_data(Sha256Hasher(), population)

df_sha

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,256,271,527
1,ExpB:Var2,223,250,473
2,Sum,479,521,1000


### Chi Squared Test

In [14]:
statistic_test(matrix_sha)

1.8062189193295999

### Bigger scale test

In [15]:
begin = time.time()
stat_tests_sha = []

for i in range(0, nb_exp):
  df_sha, matrix_sha = gen_data(Sha256Hasher(), population)
  stat_tests_sha.append(statistic_test(matrix_sha))
print(f"{time.time() - begin}s")


15.060874938964844s


In [16]:
print(f"median: {np.median(stat_tests_sha)}")
print(f"mean: {np.mean(stat_tests_sha)}")
print(f"std deviation: {np.std(stat_tests_sha)}")

median: 3.370178456464969
mean: 4.321348647773991
std deviation: 3.394073204236882


### Conclusion

We can accept the null hypothesis (assignations of experiments A and B are independant) if the median above is inferior to the critical value 3.84.