In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from sklearn.preprocessing import StandardScaler
import scipy.stats as st

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# tested lib
from user import User
from experiment import *
from hasher_implems import *

# Chi Squared Test

We consider experiments with 2 variations of 50% each.

## Hypothesis

* Experiment A has _r = 2_ levels (variation 1 and variation 2)
* Experiment B has _c = 2_ levels (variation 1 and variation 2)

The null hypothesis states that knowing the variation in experiment A does not help you predict the variation in experiment B.

$H_0$: Variable A and Variable B are independent.

$H_a$: Variable A and Variable B are not independent.

## Degrees of Freedom

`DF = (r - 1) * (c - 1) = 1`

## Expected Frequencies

The expected frequency counts are computed separately for each level of one categorical variable at each level of the other categorical variable. Compute $r \times c$ expected frequencies, according to the following formula:
$$E_{r,c} = \frac{n_r \times n_c}{n}$$

where $E_{r,c}$ is the expected frequency count for level $r$ of Variable A and level $c$ of Variable B, $n_r$ is the total number of sample observations at level $r$ of Variable A, $n_c$ is the total number of sample observations at level $c$ of Variable B, and $n$ is the total sample size.

For example, for a sample of 100 users with a perfectly balanced split (50 users per variation in each experiment), every cell would have:

$$E_{r,c} = \frac{50 \times 50}{100} = 25$$

## Test Statistic

The test statistic is a chi-square random variable ($\chi^2$) defined by the following equation:
$$\chi^2 = \sum_{r,c} \frac{(O_{r,c} - E_{r,c})^2}{E_{r,c}}$$

where $O_{r,c}$ is the observed frequency count at level $r$ of Variable A and level $c$ of Variable B, and $E_{r,c}$ is the expected frequency count at level $r$ of Variable A and level $c$ of Variable B.


## P-value

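The P-value is the probability, assuming the null hypothesis is true, of observing a test statistic at least as extreme as the one computed from the sample. For this test it is the upper-tail probability of a chi-square distribution with `DF = 1` evaluated at the test statistic.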

In [2]:
# Tool methods for chi squared test

def expected(i, j, matrix=None):
    """Return the expected count of cell (i, j) under independence.

    The value is ``row_total[i] * column_total[j] / grand_total``.

    Args:
        i: Row index of the cell.
        j: Column index of the cell.
        matrix: Contingency table as a nested sequence whose last column
            holds the row totals and whose last row holds the column totals.
            When omitted, the notebook-level global ``matrix_bi`` is used,
            which is what the two-argument form has always done.

    Returns:
        The expected frequency as a float.
    """
    if matrix is None:
        matrix = matrix_bi  # legacy fallback for two-argument callers
    return matrix[i][-1] * matrix[-1][j] / matrix[-1][-1]


def step(i, j, matrix=None):
    """Return one cell's contribution ``(O - E)^2 / E`` to the statistic.

    Prints the observed and expected counts of the cell as a side effect.

    Args:
        i: Row index of the cell.
        j: Column index of the cell.
        matrix: Contingency table with marginal totals (see ``expected``).
            Defaults to the notebook-level global ``matrix_bi``.

    Returns:
        The chi-squared term of cell (i, j).
    """
    if matrix is None:
        matrix = matrix_bi  # legacy fallback for two-argument callers
    exp = expected(i, j, matrix)
    case = matrix[i][j]
    print(f"step {i},{j}: case={case}, exp={exp}")
    return (case - exp) * (case - exp) / exp


def statistic_test(df, population: int):
    """Compute the chi-squared test statistic of a contingency table.

    The table is taken from ``df`` itself rather than from the global
    ``matrix_bi``, so the result always describes the data that was passed in.

    Args:
        df: Either a pandas DataFrame whose numeric columns hold the counts,
            with the last numeric column being the row totals and the last
            row being the column totals (the layout built by ``gen_data``),
            or a nested sequence in that same layout, or ``None`` to fall
            back to the global ``matrix_bi``.
        population: Not used by the computation; kept so existing calls
            keep working.

    Returns:
        The sum of ``(O - E)^2 / E`` over every non-total cell.
    """
    if df is None:
        matrix = matrix_bi  # legacy behaviour: use the notebook global
    elif hasattr(df, "select_dtypes"):
        # Drop the label column and keep plain Python numbers.
        matrix = df.select_dtypes(include="number").to_numpy().tolist()
    else:
        matrix = df

    # The last row and the last column are marginal totals, not observations.
    n_rows = len(matrix) - 1
    n_cols = len(matrix[0]) - 1

    result = 0
    for i in range(n_rows):
        for j in range(n_cols):
            result += step(i, j, matrix)
    return result

In [3]:
# Generation of data

def gen_data(hasher: Hasher, population):
    """Assign users to two experiments and tabulate the joint variation counts.

    Users ``1..population`` are each assigned a variation in two
    ``Experiment`` objects built from the same ``hasher``. A variation whose
    ``name`` is ``"variation1"`` is counted as variation 1; any other name is
    counted as variation 2.

    Args:
        hasher: Hasher passed to both ``Experiment`` constructors.
        population: Number of users to generate (ids start at 1).

    Returns:
        A ``(df, matrix)`` tuple holding the same 2x2 contingency table with
        marginal totals. Rows follow experiment B and columns follow
        experiment A; the last row and last column are sums. ``df`` is a
        DataFrame with a leading label column ``"/"``; ``matrix`` is a 3x3
        list of ints.
    """
    header = ["/", "ExpA:Var1", "ExpA:Var2", "Sum"]
    row_labels = ["ExpB:Var1", "ExpB:Var2", "Sum"]

    expA = Experiment(hasher)
    expB = Experiment(hasher)

    # counts[b][a]: b indexes the experiment B variation, a the experiment A one.
    counts = [[0, 0], [0, 0]]

    for i in range(1, population + 1):
        user = User(i)
        varA = expA.assign(user)
        varB = expB.assign(user)

        a = 0 if varA.name == "variation1" else 1
        b = 0 if varB.name == "variation1" else 1
        counts[b][a] += 1

    # Totals are computed once so the DataFrame and the matrix cannot diverge.
    row_totals = [sum(row) for row in counts]
    col_totals = [counts[0][a] + counts[1][a] for a in range(2)]
    matrix = [counts[0] + [row_totals[0]],
              counts[1] + [row_totals[1]],
              col_totals + [sum(row_totals)]]

    # Build the frame from the numeric rows directly, so the counts keep an
    # integer dtype without a round trip through strings.
    df = pd.DataFrame([[label] + row for label, row in zip(row_labels, matrix)],
                      columns=header)
    return df, matrix

## Built-In Hash Method

### Data Generation

In [4]:
# Built-In Hash: assign users 1..1000 to two experiments that share one
# BuiltInHasher instance and tabulate the joint variation counts.

df_bi, matrix_bi = gen_data(BuiltInHasher(), 1000)

# Display the contingency table (rows: experiment B, columns: experiment A).
df_bi

Unnamed: 0,/,ExpA:Var1,ExpA:Var2,Sum
0,ExpB:Var1,252,251,503
1,ExpB:Var2,265,232,497
2,Sum,517,483,1000


### Test Statistic

In [5]:
# Chi-squared statistic for the built-in hasher's contingency table;
# the observed/expected count of each cell is printed as a side effect.
statistic_test(df_bi, 1000)

step 0,0: case=252, exp=260.051
step 0,1: case=251, exp=242.949
step 1,0: case=265, exp=256.949
step 1,1: case=232, exp=240.051


1.0383352684286034