In [26]:
import numpy as np 
import pandas as pd

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same sample.
SEED = 42
# Seeds NumPy's global generator, which every np.random.normal call draws from.
np.random.seed(SEED)

# Number of simulated subjects (rows).
n = 250

# Y1: population mean 10, S.D. 2, variance 4
Y1 = np.random.normal(loc=10, scale=2, size=n)
# Y2: population mean 14, S.D. 1.4, variance 1.96
Y2 = np.random.normal(loc=14, scale=1.4, size=n)

# 1-based subject identifiers.
ids = np.arange(1, n + 1)

df = pd.DataFrame({
    "ID": ids,
    "Y1": Y1,
    "Y2": Y2
})

In [27]:
# Row-wise paired difference Y2 - Y1, stored as a new column of df.
df["diff"] = df["Y2"].sub(df["Y1"])

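#diff is normally distributed, given how the data were generated
#(it is the difference of two independent normal variables).
#Global (population) parameters; Y1 and Y2 are drawn independently, so their variances add:
#E(diff) = E(Y2) - E(Y1) = 14 - 10 = 4
#Var(diff) = Var(Y2) + Var(Y1) = 1.96 + 4 = 5.96
#SD(diff) = sqrt(5.96) = 2.44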

#However, the underlying distribution of Y1, Y2 and diff is treated as unknown (according to the problem statement).
#Null hypothesis: the values of diff came from a normal distribution.
#We use the Shapiro-Wilk test to test the null hypothesis.


In [28]:
#Monte Carlo estimate of the expected order statistics of a standard normal sample.
#Any normal distribution can be standardised, so the result depends only on n
#and the same function can be reused for other examples.
def generate_order_statistics(n, iterations=20000, rng=None):
    """Estimate the expected order statistics of n standard normal draws.

    Repeatedly draws a sample of size ``n`` from N(0, 1), sorts it in
    ascending order, and averages the sorted samples position by position.

    Parameters
    ----------
    n : int
        Sample size (length of the returned vector).
    iterations : int, optional
        Number of simulated samples to average over. Defaults to 20000,
        the value that was previously hard-coded.
    rng : object, optional
        Random source exposing ``normal(loc, scale, size)``, e.g. a
        ``numpy.random.Generator`` or ``numpy.random.RandomState``.
        When omitted, NumPy's global ``np.random`` state is used.

    Returns
    -------
    numpy.ndarray
        Vector of length ``n``; entry ``i`` is the average of the
        ``i``-th smallest value across all simulated samples.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be a positive integer")

    # Fall back to the global NumPy state so existing calls behave as before.
    sampler = np.random if rng is None else rng

    running_total = np.zeros(n)
    for _ in range(iterations):
        running_total += np.sort(sampler.normal(0, 1, n))
    return running_total / iterations

In [29]:
#Monte Carlo estimate of the covariance matrix of the order statistics of a
#standard normal sample. Any normal distribution can be standardised, so the
#result depends only on n and the same function can be reused for other examples.
def generate_order_covariance(n, iterations=20000, rng=None):
    """Estimate the covariance matrix of the order statistics of n N(0, 1) draws.

    Each iteration draws a sample of size ``n`` from N(0, 1) and sorts it.
    The covariance is computed as ``E[x x^T] - E[x] E[x]^T`` using the
    simulated averages (dividing by ``iterations``, not ``iterations - 1``).

    Parameters
    ----------
    n : int
        Sample size; the result has shape ``(n, n)``.
    iterations : int, optional
        Number of simulated samples. Defaults to 20000, the value that was
        previously hard-coded.
    rng : object, optional
        Random source exposing ``normal(loc, scale, size)``, e.g. a
        ``numpy.random.Generator`` or ``numpy.random.RandomState``.
        When omitted, NumPy's global ``np.random`` state is used.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` estimated covariance matrix of the sorted sample.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be a positive integer")

    # Fall back to the global NumPy state so existing calls behave as before.
    sampler = np.random if rng is None else rng

    sum_vec = np.zeros(n)          # running sum of sorted samples
    sum_outer = np.zeros((n, n))   # running sum of their outer products

    for _ in range(iterations):
        ordered = np.sort(sampler.normal(0, 1, n))
        sum_vec += ordered
        sum_outer += np.outer(ordered, ordered)

    mean_vec = sum_vec / iterations

    # Cov = E[x x^T] - E[x] E[x]^T
    covariance_matrix = (sum_outer / iterations) - np.outer(mean_vec, mean_vec)

    return covariance_matrix

In [30]:
def Shapiro_Wilk(vector_sample, m, sigma):
    """Compute the Shapiro-Wilk W statistic for a sample.

    W = (sum_i a_i * x_(i))^2 / sum_i (x_i - x_bar)^2, where x_(i) are the
    sorted observations and the coefficient vector is
    a = V^{-1} m / ||V^{-1} m||, so that a has unit Euclidean length.

    Parameters
    ----------
    vector_sample : array-like
        The observations to test (any order).
    m : array-like
        Expected values of the standard normal order statistics, same
        length as ``vector_sample``.
    sigma : array-like
        Covariance matrix V of those order statistics, shape (n, n).

    Returns
    -------
    float
        The W statistic.
    """
    x = np.asarray(vector_sample, dtype=float)
    m = np.asarray(m, dtype=float)
    x_ordered = np.sort(x)

    x_bar = np.mean(x)

    # Total sum of squares about the mean.
    denominator = np.sum((x - x_bar) ** 2)

    # V^{-1} m, obtained by solving V z = m rather than inverting V.
    a_num = np.linalg.solve(sigma, m)
    # Normalise to unit length: the divisor is sqrt(m^T V^{-1} V^{-1} m),
    # i.e. the Euclidean norm of V^{-1} m (not sqrt(m^T V^{-1} m)).
    a = a_num / np.linalg.norm(a_num)

    numerator = (np.sum(a * x_ordered)) ** 2

    W = numerator / denominator

    return W


In [31]:
# Sample size: number of observations in the "diff" column.
n = df["diff"].shape[0]

# Simulated mean vector and covariance matrix of the standard normal
# order statistics for a sample of this size.
m = generate_order_statistics(n=n)
sigma = generate_order_covariance(n=n)

# Hand-computed W statistic for the paired differences.
W = Shapiro_Wilk(vector_sample=df["diff"], m=m, sigma=sigma)


In [32]:
from scipy.stats import shapiro
# Reference result from SciPy: W statistic and p-value for the same differences.
W2, p = shapiro(df["diff"].to_numpy())