In [1]:
import numpy as np
from scipy.stats import norm
from statsmodels.stats.weightstats import ztest

In [2]:
# Seed NumPy's global random number generator so that every random sample
# drawn with np.random in this notebook is reproducible from run to run.
np.random.seed(seed=0)

### One sample test

In [3]:
# Simulate N observations from a normal distribution with population
# mean mu and population standard deviation sigma: standard-normal
# draws are scaled by sigma and then shifted by mu.
N, mu, sigma = 100, 0.2, 1
x = mu + sigma * np.random.randn(N)

In [4]:
# Built-in one-sample z-test of H0: mean == 0 against a two-sided
# alternative, with the default arguments spelled out explicitly.
# Returns the pair (test statistic, p-value).
ztest(x, value=0, alternative="two-sided")

(2.5648404153513686, 0.01032232684881584)

In [5]:
# Two-sided one-sample z-test computed by hand (null value mu0 = 0).
mu_hat = np.mean(x)                    # sample mean
sigma_hat = np.std(x, ddof=1)          # sample SD (n - 1 in the denominator)
std_err = sigma_hat / np.sqrt(N)       # estimated standard error of the mean
z = mu_hat / std_err                   # (mu_hat - 0) / SE
upper_tail = 1 - norm.cdf(np.abs(z))   # area to the right of +|z|
lower_tail = norm.cdf(-np.abs(z))      # area to the left of -|z|
p = upper_tail + lower_tail            # two-sided p-value: both tails together
z, p

(2.564840415351368, 0.010322326848815901)

In [6]:
# Built-in one-sided z-test: H0 mean == 0 versus the alternative that the
# mean is larger than 0 (the null value is written out explicitly).
ztest(x, value=0, alternative="larger")

(2.5648404153513686, 0.00516116342440792)

In [7]:
# One-sided (upper-tail) one-sample z-test computed by hand, null value mu0 = 0.
mu_hat = np.mean(x)                # sample mean
sigma_hat = np.std(x, ddof=1)      # sample SD (n - 1 in the denominator)
std_err = sigma_hat / np.sqrt(N)   # estimated standard error of the mean
z = mu_hat / std_err               # (mu_hat - 0) / SE
p = 1 - norm.cdf(z)                # upper-tail area only: P(Z > z)
z, p

(2.564840415351368, 0.005161163424407977)

In [8]:
# Built-in two-sided z-test against a non-zero reference value:
# H0 is now mean == mu0 instead of mean == 0.
mu0 = 0.2
ztest(x, value=mu0, alternative="two-sided")

(0.5904283402851699, 0.5549035151647227)

In [9]:
# Two-sided one-sample z-test by hand against a non-zero reference value mu0.
mu_hat = np.mean(x)                    # sample mean
sigma_hat = np.std(x, ddof=1)          # sample SD (n - 1 in the denominator)
std_err = sigma_hat / np.sqrt(N)       # estimated standard error of the mean
z = (mu_hat - mu0) / std_err           # centred on the null value mu0, not on 0
upper_tail = 1 - norm.cdf(np.abs(z))   # area to the right of +|z|
lower_tail = norm.cdf(-np.abs(z))      # area to the left of -|z|
p = upper_tail + lower_tail            # two-sided p-value: both tails together
z, p

(0.5904283402851698, 0.5549035151647228)

### Two sample test
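Test statistic: $z = \frac{(\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{SE_{pool}}$, where $\mu_1 - \mu_2$ is the difference in population means assumed under the null hypothesis and $SE_{pool}$ is the pooled standard error of the difference in sample means.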

In [10]:
# Simulate two independent normal samples with different population means.
# Each sample is drawn with its OWN size (N0 / N1), so changing either
# size here actually changes the corresponding sample.
N0 = 100 # sample size of group 1
mu0 = 0.2 # population mean of group 1
sigma0 = 1 # population SD of group 1
x0 = np.random.randn(N0)*sigma0 + mu0

N1 = 100 # sample size of group 2
mu1 = 0.5 # population mean of group 2
sigma1 = 1 # population SD of group 2
x1 = np.random.randn(N1)*sigma1 + mu1

In [11]:
# Built-in two-sample, two-sided z-test of H0: the difference between the
# two population means is 0 (the null difference is written out explicitly).
ztest(x0, x1, value=0)

(-1.1234612344369315, 0.2612416557056353)

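Here the p-value (about 0.26) is greater than the significance level $\alpha = 0.05$, so we fail to reject the null hypothesis, even though the two samples were generated from populations with different means (0.2 and 0.5). With this sample size, the test simply does not have enough evidence to detect the difference.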

In [13]:
# Two-sample, two-sided z-test computed by hand with a POOLED standard error
# (the same variance choice statsmodels' ztest uses by default).
mu_hat0 = x0.mean() # sample mean of group 1
mu_hat1 = x1.mean() # sample mean of group 2
# Difference taken as group 2 minus group 1. ztest(x0, x1) uses the opposite
# order, so its statistic has the opposite sign; the two-sided p-value is
# unaffected by the order.
dmu_hat = mu_hat1 - mu_hat0
n0, n1 = x0.size, x1.size # sample sizes taken from the data themselves
s2_hat0 = x0.var(ddof = 1) # sample variance of group 1
s2_hat1 = x1.var(ddof = 1) # sample variance of group 2
# Pooled variance: each group's variance weighted by its degrees of freedom.
s2_pool = ((n0 - 1) * s2_hat0 + (n1 - 1) * s2_hat1) / (n0 + n1 - 2)
# Pooled standard error of the difference in means. For equal sample sizes
# this equals sqrt(s2_hat0 / n0 + s2_hat1 / n1).
s_hat = np.sqrt(s2_pool * (1 / n0 + 1 / n1))
z = dmu_hat / s_hat
p_right = 1 - norm.cdf(np.abs(z)) # area to the right of +|z|
p_left = norm.cdf(-np.abs(z)) # area to the left of -|z|
p = p_right + p_left
z, p

(1.1234612344369315, 0.26124165570563523)

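Here the sign of the z statistic is opposite to the one returned by `ztest`. The sign depends only on the order in which the difference of the sample means is taken: our implementation uses group 2 minus group 1, whereas `ztest(x0, x1)` uses the first sample minus the second. The two-sided p-value is the same either way.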

In [15]:
# Show that we reject the null hypothesis about 5 % of the time even when it
# is true: both samples below come from the same standard normal
# distribution, so every rejection at level alpha is a false positive.
num_tests = 10000
alpha = 0.05 # significance level
results = np.zeros(num_tests)
for i in range(num_tests):
    # Loop-local names: the notebook-level x (one-sample data), x1 (group 2
    # sample) and p are left untouched, so earlier cells can still be re-run.
    null_sample_a = np.random.randn(100)
    null_sample_b = np.random.randn(200)
    z_stat, p_value = ztest(null_sample_a, null_sample_b)
    results[i] = (p_value < alpha) # 1.0 if H0 was rejected, else 0.0
print(results.mean()) # empirical rejection rate

0.0459
