# Statistical tests in a nutshell

last updated : 210119
___

- This notebook compares self-made classes against existing packages for hypothesis testing.
- typical testing procedure
    1. specify H_0/H_1
    2. build the test statistic
    3. determine the distribution of the test statistic
    4. using 3, calculate the p-value
- *(the tests listed below are available in the self-made classes)*

    A. Small sample : one sample t/z test, permutation test
    
    B. two sample
    - two sample t/z test
    - paired t-test
    - wilcoxon-mann-whitney
    - wilcoxon signed rank sum


    C. categorical
    - fisher's exact test
    - chi-square
    - mcnemar
    
    
    D. Anova/ ancova
    - f-test
    - anova (one-way/ two-way/ rm / factorial)
    - kruskal wallis
    - Friedman test

ref>>
bootstrap test : https://www.youtube.com/watch?app=desktop&v=9STZ7MxkNVg

## A. Small sample tests

- You may find *One sample* tests and tests for small sample in the **SmallSample** class.

- One sample t/z test
    - if you know the population variance, it will be Z (the default is t-test)
    - in one sample t/z test, you should designate the **specific value** you want to confirm
    - see help(SmallSample) for more information in conducting tests
    
- Permutation test
    - Useful when the assumptions for the parametric approach are not met
    - Can test something other than the classic targets (mean, median)
    - Does not provide a confidence interval
    

In [2]:
# existing methods
import numpy as np; import pandas as pd

class SmallSample():
    """
    Class SmallSample performs three different tests
        - one sample z-test : you need to specify the population standard
          deviation (sigma)
        - one sample t-test
        - permutation test : two groups, for small samples

    <Things you should specify>
        df : dataframe
            df[0] = data; for the permutation test another column holds the
            group label of every row (its name is passed to PermutationTest)
        h0 : specified value you want to test the mean against (default = 0)
        h1 : alternative hypothesis, one of "not equal", "less", "greater"
            (default = "not equal")
        types : mean or proportion (default = "mean"); stored on the
            instance but not used by any of the tests below
    """
    def __init__(self, df, h0=0, h1="not equal", types="mean"):
        self.df = df
        self.h0 = h0
        self.h1 = h1
        self.types = types

    def _pvalue(self, stat, dist):
        """
        Return the p-value of `stat` under the scipy distribution `dist`
        for the alternative hypothesis stored in self.h1.

        Raises ValueError when self.h1 is not a supported alternative
        (instead of silently returning None).
        """
        # sf (= 1 - cdf) is used for the upper tail because it keeps
        # precision when the p-value is very small.
        if self.h1 == "not equal":
            return 2 * dist.sf(abs(stat))
        if self.h1 == "less":
            return dist.cdf(stat)
        if self.h1 == "greater":
            return dist.sf(stat)
        raise ValueError(
            'h1 must be "not equal", "less" or "greater", got %r' % (self.h1,)
        )

    def ZTest(self, sigma, alpha=0.05):
        """
        One sample z-test of H0: mean(df[0]) == h0.

        sigma : known population standard deviation (must be positive)
        alpha : significance level; kept for backward compatibility, it does
            not influence the returned values

        Returns (Z, pval): the test statistic and its p-value.
        Raises ValueError for a non-positive sigma or an unknown h1.
        """
        from scipy import stats
        import math

        if sigma <= 0:
            raise ValueError("sigma must be positive")
        sample = self.df[0]
        # standardisation: test statistic
        Z = (sample.mean() - self.h0) / (sigma / math.sqrt(len(sample)))
        return Z, self._pvalue(Z, stats.norm)

    def TTest(self, alpha=0.05):
        """
        One sample t-test of H0: mean(df[0]) == h0.

        alpha : significance level; kept for backward compatibility, it does
            not influence the returned values

        Returns (T, pval): the test statistic and its p-value, using a
        t distribution with n-1 degrees of freedom.
        Raises ValueError for an unknown h1.
        """
        from scipy import stats
        import math

        sample = self.df[0]
        n = len(sample)
        # standardisation: test statistic (pandas std() is the sample
        # standard deviation, ddof=1)
        T = (sample.mean() - self.h0) / (sample.std() / math.sqrt(n))
        return T, self._pvalue(T, stats.t(df=n - 1))

    def PermutationTest(self, group, iters=10, statistic="mean"):
        """
        Two-group permutation test on the absolute difference of a statistic.

        group : name of the column of self.df that holds the group labels
            (exactly two distinct labels are required)
        iters : number of random permutations (must be >= 1)
        statistic : "mean" or "median", the summary compared between groups

        Returns the approximate two-sided p-value: the share of permutations
        whose absolute difference is at least as large as the observed one.
        Raises ValueError for an unknown statistic, iters < 1, or a group
        column that does not contain exactly two labels.
        """
        import random
        import numpy as np

        summaries = {"mean": np.mean, "median": np.median}
        if statistic not in summaries:
            raise ValueError('statistic must be "mean" or "median"')
        if iters < 1:
            raise ValueError("iters must be at least 1")
        summarise = summaries[statistic]

        # Read everything from self.df, never from a notebook-level global.
        values = self.df[0].to_numpy(dtype=float)
        labels = self.df[group].to_numpy()
        levels = self.df[group].unique()
        if len(levels) != 2:
            raise ValueError("PermutationTest needs exactly two groups")

        in_first = labels == levels[0]
        n_first = int(in_first.sum())
        observed = abs(summarise(values[in_first]) - summarise(values[~in_first]))

        pool = values.tolist()
        extreme = 0
        for _ in range(iters):
            random.shuffle(pool)
            # Split at the real size of the first group so every observation
            # lands in exactly one pseudo-group.
            permuted = abs(summarise(pool[:n_first]) - summarise(pool[n_first:]))
            if permuted >= observed:
                extreme += 1
        # Proportion over the number of permutations, not the sample size.
        return extreme / iters

#### examples A 
- existing package, own codes

In [3]:
# Generate data: draw 100 samples from a normal distribution with mean 0 and
# standard deviation 1, and store them in a one-column DataFrame (column 0).
x = np.random.normal(loc=0,scale=1,size=100)
data = pd.DataFrame(x)
data.head()  # preview the first rows

Unnamed: 0,0
0,-0.690604
1,-1.028163
2,0.93987
3,0.814921
4,0.773056


In [4]:
# Z test (self-made class): H0 mean = 3 against H1 mean > 3, with the
# population standard deviation passed as sigma = 1.
dan = SmallSample(df=data, h0=3, h1="greater")
dan.ZTest(alpha=0.05, sigma=1)

(-30.953299878580832, 1.0)

In [5]:
# T test (self-made class): H0 mean = 0 against H1 mean > 0.
dan = SmallSample(df=data, h0=0, h1="greater")
dan.TTest(alpha=0.05)

(-0.9850354299270078, 0.8364960274647698)

In [None]:
# T test with the existing package (scipy): tests column 0 of `data`
# against a hypothesised mean of 0.
# NOTE(review): no alternative is passed here, so this presumably runs scipy's
# default two-sided test, while the self-made T test cell uses h1="greater" —
# confirm before comparing the two p-values.
from scipy.stats import ttest_1samp
ttest_1samp(data[0], 0)

In [6]:
# Permutation test (self-made class) on 30 random integers drawn from 1..99.
import numpy as np
data = pd.DataFrame(np.random.randint(1,100, size=30))
# .loc slices by label and includes both ends: rows 0-15 (16 rows) are
# labelled 'A' and rows 16-29 (14 rows) are labelled 'B'.
data.loc[:15,'group'] = 'A';data.loc[16:,'group'] = 'B'
dan = SmallSample(df=data)
dan.PermutationTest(group='group',iters=10)

0.2

In [None]:
# Reference result from the existing package: mlxtend's permutation_test on
# the group 'A' values vs the group 'B' values of column 0
# (method='approximate', 100 rounds, seed 0).
from mlxtend.evaluate import permutation_test
permutation_test(data.loc[data['group']=='A',0], data.loc[data['group']=='B',0],method='approximate',num_rounds=100,seed=0)

## B. Two samples tests

- Two sample t/z test
    - if you know the population variance, it will be Z (the default is t-test)
    - in one sample t/z test, you should designate the **specific value** you want to confirm
    - see help(SmallSample) for more information in conducting tests
    
- paired t-test
- wilcoxon-mann-whitney
- wilcoxon signed rank sum
    

In [2]:
# existing methods
import numpy as np; import pandas as pd

class SmallSample():
    """
    Class SmallSample performs three different tests
        - one sample z-test : you need to specify the population standard
          deviation (sigma)
        - one sample t-test
        - permutation test : two groups, for small samples

    NOTE: the two sample z/t tests, the wilcoxon-mann-whitney test and the
    wilcoxon signed rank sum test are not implemented in this class; the
    methods below are the one sample tests and the permutation test only.

    <Things you should specify>
        df : dataframe
            df[0] = data; for the permutation test another column holds the
            group label of every row (its name is passed to PermutationTest)
        h0 : specified value you want to test the mean against (default = 0)
        h1 : alternative hypothesis, one of "not equal", "less", "greater"
            (default = "not equal")
        types : mean or proportion (default = "mean"); stored on the
            instance but not used by any of the tests below
    """
    def __init__(self, df, h0=0, h1="not equal", types="mean"):
        self.df = df
        self.h0 = h0
        self.h1 = h1
        self.types = types

    def _pvalue(self, stat, dist):
        """
        Return the p-value of `stat` under the scipy distribution `dist`
        for the alternative hypothesis stored in self.h1.

        Raises ValueError when self.h1 is not a supported alternative
        (instead of silently returning None).
        """
        # sf (= 1 - cdf) is used for the upper tail because it keeps
        # precision when the p-value is very small.
        if self.h1 == "not equal":
            return 2 * dist.sf(abs(stat))
        if self.h1 == "less":
            return dist.cdf(stat)
        if self.h1 == "greater":
            return dist.sf(stat)
        raise ValueError(
            'h1 must be "not equal", "less" or "greater", got %r' % (self.h1,)
        )

    def ZTest(self, sigma, alpha=0.05):
        """
        One sample z-test of H0: mean(df[0]) == h0.

        sigma : known population standard deviation (must be positive)
        alpha : significance level; kept for backward compatibility, it does
            not influence the returned values

        Returns (Z, pval): the test statistic and its p-value.
        Raises ValueError for a non-positive sigma or an unknown h1.
        """
        from scipy import stats
        import math

        if sigma <= 0:
            raise ValueError("sigma must be positive")
        sample = self.df[0]
        # standardisation: test statistic
        Z = (sample.mean() - self.h0) / (sigma / math.sqrt(len(sample)))
        return Z, self._pvalue(Z, stats.norm)

    def TTest(self, alpha=0.05):
        """
        One sample t-test of H0: mean(df[0]) == h0.

        alpha : significance level; kept for backward compatibility, it does
            not influence the returned values

        Returns (T, pval): the test statistic and its p-value, using a
        t distribution with n-1 degrees of freedom.
        Raises ValueError for an unknown h1.
        """
        from scipy import stats
        import math

        sample = self.df[0]
        n = len(sample)
        # standardisation: test statistic (pandas std() is the sample
        # standard deviation, ddof=1)
        T = (sample.mean() - self.h0) / (sample.std() / math.sqrt(n))
        return T, self._pvalue(T, stats.t(df=n - 1))

    def PermutationTest(self, group, iters=10, statistic="mean"):
        """
        Two-group permutation test on the absolute difference of a statistic.

        group : name of the column of self.df that holds the group labels
            (exactly two distinct labels are required)
        iters : number of random permutations (must be >= 1)
        statistic : "mean" or "median", the summary compared between groups

        Returns the approximate two-sided p-value: the share of permutations
        whose absolute difference is at least as large as the observed one.
        Raises ValueError for an unknown statistic, iters < 1, or a group
        column that does not contain exactly two labels.
        """
        import random
        import numpy as np

        summaries = {"mean": np.mean, "median": np.median}
        if statistic not in summaries:
            raise ValueError('statistic must be "mean" or "median"')
        if iters < 1:
            raise ValueError("iters must be at least 1")
        summarise = summaries[statistic]

        # Read everything from self.df, never from a notebook-level global.
        values = self.df[0].to_numpy(dtype=float)
        labels = self.df[group].to_numpy()
        levels = self.df[group].unique()
        if len(levels) != 2:
            raise ValueError("PermutationTest needs exactly two groups")

        in_first = labels == levels[0]
        n_first = int(in_first.sum())
        observed = abs(summarise(values[in_first]) - summarise(values[~in_first]))

        pool = values.tolist()
        extreme = 0
        for _ in range(iters):
            random.shuffle(pool)
            # Split at the real size of the first group so every observation
            # lands in exactly one pseudo-group.
            permuted = abs(summarise(pool[:n_first]) - summarise(pool[n_first:]))
            if permuted >= observed:
                extreme += 1
        # Proportion over the number of permutations, not the sample size.
        return extreme / iters

#### examples B 
- existing package, own codes